[llvm] expandFMINIMUMNUM_FMAXIMUMNUM: Improve compare between zeros (PR #140193)

YunQiang Su via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 2 20:40:22 PST 2025


https://github.com/wzssyqa updated https://github.com/llvm/llvm-project/pull/140193

>From 5735e205de45ade9ed6e4d9d3cceecb3b97be1f4 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Wed, 3 Dec 2025 11:08:01 +0800
Subject: [PATCH 1/5] expandFMINIMUMNUM_FMAXIMUMNUM: Improve compare between
 zeros

1. On GPR32 platforms, expandIS_FPCLASS may fail because the ISD::BITCAST
from double to int64 may fail. Let's FP_ROUND the double to float first.
Since the result is used only if MinMax is zero, the flushing won't
break anything.

2. Only one IS_FPCLASS is needed. MinMax will always be RHS if the operands
are equal, so we can select between LHS and MinMax.
This is safe even if FP_ROUND flushes a small LHS: if LHS is not zero,
then MinMax won't be zero either, so we will always use MinMax.
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   20 +-
 llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll  |  992 ++-
 llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll  |  437 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll   | 5743 +++++++--------
 llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll   | 6443 +++++++----------
 .../CodeGen/Mips/fp-maximumnum-minimumnum.ll  |  763 +-
 6 files changed, 6551 insertions(+), 7847 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 783ec4b0bd211..15a20b54b5e07 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8852,6 +8852,7 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
     RHS = DAG.getSelectCC(DL, RHS, RHS, LHS, RHS, ISD::SETUO);
   }
 
+  // Please always prefer RHS if equal.
   SDValue MinMax =
       DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
 
@@ -8866,13 +8867,20 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
       DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
   SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
                                 DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
-  SDValue LCmp = DAG.getSelect(
-      DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
+  EVT IntVT = VT.changeTypeToInteger();
+  EVT FloatVT = VT.changeElementType(MVT::f32);
+  SDValue LHSTrunc = LHS;
+  if (!isOperationLegal(ISD::BITCAST, IntVT) &&
+      !isOperationLegal(ISD::IS_FPCLASS, VT)) {
+    LHSTrunc = DAG.getNode(ISD::FP_ROUND, DL, FloatVT, LHS,
+                           DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+  }
+  // It's OK to select between LHS and MinMax with only one ISD::IS_FPCLASS,
+  // because MinMax was generated preferring RHS when the operands are equal.
+  SDValue RetZero = DAG.getSelect(
+      DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHSTrunc, TestZero), LHS,
       MinMax, Flags);
-  SDValue RCmp = DAG.getSelect(
-      DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS, LCmp,
-      Flags);
-  return DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
+  return DAG.getSelect(DL, VT, IsZero, RetZero, MinMax, Flags);
 }
 
 /// Returns a true value if if this FPClassTest can be performed with an ordered
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index 35150f2d43942..2465c1eef6b10 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1713,14 +1713,12 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -1731,12 +1729,10 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1752,14 +1748,12 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -1770,12 +1764,10 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1789,44 +1781,33 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1842,15 +1823,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
@@ -1859,14 +1838,12 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1880,87 +1857,76 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v3.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
 ; GFX11-SDAG-FAKE16:       ; %bb.0:
 ; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1980,49 +1946,37 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v3.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -2033,58 +1987,51 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call bfloat @llvm.maximumnum.bf16(bfloat %a, bfloat %b)
   %max3 = call bfloat @llvm.maximumnum.bf16(bfloat %tmp0, bfloat %c)
@@ -2155,15 +2102,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2172,15 +2117,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
@@ -2191,12 +2134,10 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2207,15 +2148,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2232,15 +2171,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2249,15 +2186,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
@@ -2268,12 +2203,10 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2284,14 +2217,12 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2306,69 +2237,52 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v4
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2376,22 +2290,17 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2409,6 +2318,7 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
@@ -2416,30 +2326,25 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -2447,27 +2352,23 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2476,87 +2377,68 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v4.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v4, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v4.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v3.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v4.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v1.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v5.h, s2
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.h, s2
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v4, v4
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v2.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v2.h, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v5.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v2.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v1, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v2.l, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v1.h, s2
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v2.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0:
@@ -2567,82 +2449,72 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2655,102 +2527,81 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v4.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v4, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v4.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v4.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v5.h, s2
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.h, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v4, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v5.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v2.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v2.h, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v1, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v2.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v2.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v1.h, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v2.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0:
@@ -2766,102 +2617,87 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
index 16553afd7c700..ca660a9eec137 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
@@ -1713,15 +1713,13 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -1732,12 +1730,10 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1753,15 +1749,13 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
-; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-SDAG-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -1772,12 +1766,10 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1787,48 +1779,37 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1844,15 +1825,13 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
@@ -1861,14 +1840,12 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -2157,16 +2134,14 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2175,15 +2150,13 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
@@ -2194,12 +2167,10 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2210,15 +2181,13 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2235,16 +2204,14 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
-; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-SDAG-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v4
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2253,15 +2220,13 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
@@ -2272,12 +2237,10 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2288,14 +2251,12 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-SDAG-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2310,69 +2271,53 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v4
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2380,23 +2325,17 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-SDAG-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2414,6 +2353,7 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
@@ -2421,30 +2361,25 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -2452,27 +2387,23 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2572,82 +2503,72 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
index 2385035c02073..f944686a96cd7 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -33,14 +33,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximumnum_bf16:
@@ -55,14 +53,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximumnum_bf16:
@@ -75,22 +71,17 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_bf16:
@@ -105,14 +96,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_bf16:
@@ -156,17 +145,15 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_bf16:
@@ -263,14 +250,12 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximumnum_bf16_nnan:
@@ -278,15 +263,13 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e64 s[4:5], v3, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v2
+; GFX900-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximumnum_bf16_nnan:
@@ -294,19 +277,14 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e64 s[0:1], v3, v2
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[0:1]
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[0:1], 0, v2
+; GFX950-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_bf16_nnan:
@@ -315,14 +293,12 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_bf16_nnan:
@@ -352,15 +328,13 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_bf16_nnan:
@@ -451,15 +425,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -468,15 +440,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -493,15 +463,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -510,14 +478,12 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -532,45 +498,35 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -588,6 +544,7 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
@@ -595,24 +552,19 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v2bf16:
@@ -671,40 +623,37 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v2bf16:
@@ -856,26 +805,21 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v3, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_sdwa v1, v1, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -886,26 +830,21 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v1, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -916,65 +855,48 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
+; GFX950-NEXT:    v_cndmask_b32_sdwa v1, v1, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2bf16_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v5, v4
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v1, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v5
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1167,15 +1089,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1184,15 +1104,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -1201,14 +1119,12 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1226,15 +1142,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1243,15 +1157,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -1260,14 +1172,12 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1282,68 +1192,53 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1351,58 +1246,52 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v0, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v9, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v3bf16:
@@ -1478,59 +1367,56 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16:
@@ -1731,38 +1617,32 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1773,38 +1653,32 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1815,92 +1689,71 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3bf16_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v3bf16_nnan:
@@ -2149,15 +2002,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -2168,15 +2019,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2185,15 +2034,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -2202,14 +2049,12 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -2229,15 +2074,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -2248,15 +2091,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2265,15 +2106,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -2282,14 +2121,12 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v4, v1, s4
@@ -2305,93 +2142,73 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4bf16:
@@ -2400,75 +2217,67 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_sdwa v11, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v10, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v7, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v8, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v8
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v9
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2562,80 +2371,78 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2887,52 +2694,43 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -2946,50 +2744,42 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v4, s4
@@ -3001,68 +2791,48 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
-; GFX950-NEXT:    v_perm_b32 v1, v1, v4, s0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
+; GFX950-NEXT:    v_perm_b32 v1, v1, v4, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4bf16_nnan:
@@ -3070,53 +2840,45 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v9, v8
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v12, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v13
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v4
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v13, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3434,15 +3196,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -3453,15 +3213,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
@@ -3472,15 +3230,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v8
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3489,15 +3245,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -3506,15 +3260,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -3523,14 +3275,12 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
@@ -3553,15 +3303,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -3572,15 +3320,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
@@ -3591,15 +3337,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v8
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3608,15 +3352,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -3625,15 +3367,13 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -3642,14 +3382,12 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v8, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v7, v1, s4
@@ -3668,140 +3406,110 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v6
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v7
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v8
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-NEXT:    v_perm_b32 v2, v6, v2, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    v_perm_b32 v1, v7, v1, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v8, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v7, v1, s0
+; GFX950-NEXT:    v_perm_b32 v2, v6, v2, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v6bf16:
@@ -3809,113 +3517,101 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_sdwa v12, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_sdwa v14, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v10, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v15, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v10, v12, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v14, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v8, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v10, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v11, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v9, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v9, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v11, v11
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v10, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v13, v11
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v15, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v10
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v11
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
 ; GFX10-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v2, v7, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4047,114 +3743,113 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v12, 16, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v15, v14 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v14, v10 :: v_dual_lshlrev_b32 v15, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v12, v10, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v9, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v13, v12
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v15, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v10, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4530,15 +4225,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v8
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -4549,15 +4242,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v9
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
@@ -4568,15 +4259,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v10
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -4587,15 +4276,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v11
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4604,15 +4291,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
@@ -4621,15 +4306,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -4638,15 +4321,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
@@ -4655,14 +4336,12 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
@@ -4687,15 +4366,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v8
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -4706,15 +4383,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v9
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
@@ -4725,15 +4400,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v10
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -4744,15 +4417,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v11
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4761,15 +4432,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
@@ -4778,15 +4447,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -4795,15 +4462,13 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
@@ -4812,14 +4477,12 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v11, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v10, v1, s4
@@ -4839,188 +4502,148 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v8
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
-; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v9
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
-; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v10
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v11
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
-; GFX950-NEXT:    v_perm_b32 v3, v8, v3, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    v_perm_b32 v2, v9, v2, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-NEXT:    v_perm_b32 v1, v10, v1, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v11, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v10, v1, s0
+; GFX950-NEXT:    v_perm_b32 v2, v9, v2, s0
+; GFX950-NEXT:    v_perm_b32 v3, v8, v3, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v8bf16:
@@ -5029,151 +4652,135 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v12, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v17, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v18, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v17, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v13
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX10-NEXT:    s_and_b32 s5, s7, s8
 ; GFX10-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s5
+; GFX10-NEXT:    s_and_b32 s5, s9, s10
 ; GFX10-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s5
+; GFX10-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v8bf16:
@@ -6002,15 +5609,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v17
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
@@ -6021,15 +5626,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v18
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
@@ -6040,15 +5643,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v19
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
@@ -6059,15 +5660,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v20
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
@@ -6078,15 +5677,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v21
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
@@ -6097,15 +5694,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v22
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
@@ -6116,15 +5711,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v23
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
@@ -6135,15 +5728,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v24
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6152,15 +5743,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
@@ -6169,15 +5758,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
@@ -6186,15 +5773,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
@@ -6203,15 +5788,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
@@ -6220,15 +5803,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
@@ -6237,15 +5818,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
@@ -6254,15 +5833,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -6271,14 +5848,12 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v22
@@ -6311,15 +5886,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v17
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
@@ -6330,15 +5903,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v18
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
@@ -6349,15 +5920,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v19
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
@@ -6368,15 +5937,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v20
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
@@ -6387,15 +5954,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v21
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
@@ -6406,15 +5971,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v22
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
@@ -6425,15 +5988,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v23
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
@@ -6444,15 +6005,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v24
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6461,15 +6020,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
@@ -6478,15 +6035,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
@@ -6495,15 +6050,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
@@ -6512,15 +6065,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
@@ -6529,15 +6080,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
@@ -6546,15 +6095,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
@@ -6563,15 +6110,13 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -6580,14 +6125,12 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v23, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v22, v1, s4
@@ -6611,377 +6154,297 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
-; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v16
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v19
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
-; GFX950-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
-; GFX950-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX950-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
-; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
-; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v17
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
+; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
+; GFX950-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v18
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
-; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v19
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
-; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v20
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
-; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v22, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v21
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
-; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v23, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v22
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
-; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v24, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v23
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v23, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v7
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
-; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX950-NEXT:    v_perm_b32 v7, v16, v7, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v6
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
-; GFX950-NEXT:    v_perm_b32 v6, v17, v6, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v14
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX950-NEXT:    v_perm_b32 v5, v18, v5, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v4
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v13
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX950-NEXT:    v_perm_b32 v4, v19, v4, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX950-NEXT:    v_perm_b32 v3, v20, v3, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v11
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
-; GFX950-NEXT:    v_perm_b32 v2, v21, v2, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v10
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
-; GFX950-NEXT:    v_perm_b32 v1, v22, v1, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v23, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v22, v1, s0
+; GFX950-NEXT:    v_perm_b32 v2, v21, v2, s0
+; GFX950-NEXT:    v_perm_b32 v3, v20, v3, s0
+; GFX950-NEXT:    v_perm_b32 v4, v19, v4, s0
+; GFX950-NEXT:    v_perm_b32 v5, v18, v5, s0
+; GFX950-NEXT:    v_perm_b32 v6, v17, v6, s0
+; GFX950-NEXT:    v_perm_b32 v7, v16, v7, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v16bf16:
@@ -8943,48 +8406,51 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
-; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v51, v50, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v48, v48
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v31
+; GFX8-NEXT:    v_cndmask_b32_e64 v38, v38, v35, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX8-NEXT:    v_cmp_gt_f32_e64 s[6:7], v39, v48
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v32, v31, s[6:7]
+; GFX8-NEXT:    v_cmp_gt_f32_e64 s[6:7], v36, v49
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v38, v35, s[6:7]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v32
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[6:7], 0, v38
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v35
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[8:9], 0, v39
+; GFX8-NEXT:    s_and_b64 vcc, s[6:7], vcc
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX8-NEXT:    s_and_b64 vcc, s[8:9], s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v36, v35, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
@@ -8997,43 +8463,32 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v33, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v32
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v33, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX8-NEXT:    v_cmp_gt_f32_e64 s[4:5], v34, v36
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v35, v33, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v33
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v35
+; GFX8-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GFX8-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v50, v37, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v34, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v37, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v37
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v37, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
@@ -9043,15 +8498,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v35
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
@@ -9061,34 +8514,29 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v36
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v37
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
@@ -9098,33 +8546,30 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v38
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v39
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
@@ -9134,15 +8579,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v48
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
@@ -9152,15 +8595,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v49
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
@@ -9170,15 +8611,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v50
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
@@ -9188,15 +8627,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v51
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
@@ -9206,15 +8643,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v52
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
@@ -9224,15 +8659,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
-; GFX8-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v53
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
@@ -9242,15 +8675,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
-; GFX8-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v54
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -9259,15 +8690,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT:    v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v15
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
@@ -9276,15 +8705,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v14
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
@@ -9293,15 +8720,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v55, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v13
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
@@ -9310,15 +8735,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v30, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v12
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
@@ -9327,15 +8750,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v29, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v11
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
@@ -9344,15 +8765,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v28, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v10
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
@@ -9361,15 +8780,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v27, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v9
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
@@ -9378,15 +8795,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v8
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
@@ -9395,18 +8810,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
@@ -9415,15 +8825,16 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
@@ -9432,15 +8843,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
@@ -9449,15 +8858,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
@@ -9466,15 +8873,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
@@ -9483,15 +8888,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
@@ -9500,15 +8903,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
@@ -9517,14 +8918,12 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v54
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v53
@@ -9551,11 +8950,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v34
 ; GFX8-NEXT:    v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -9565,48 +8964,51 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX900-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX900-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
-; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
+; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v51, v50, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v48, v48
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v31
+; GFX900-NEXT:    v_cndmask_b32_e64 v38, v38, v35, s[4:5]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX900-NEXT:    v_cmp_gt_f32_e64 s[6:7], v39, v48
+; GFX900-NEXT:    v_cndmask_b32_e64 v32, v32, v31, s[6:7]
+; GFX900-NEXT:    v_cmp_gt_f32_e64 s[6:7], v36, v49
+; GFX900-NEXT:    v_cndmask_b32_e64 v36, v38, v35, s[6:7]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v32
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX900-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX900-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[6:7], 0, v38
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v35
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[8:9], 0, v39
+; GFX900-NEXT:    s_and_b64 vcc, s[6:7], vcc
+; GFX900-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX900-NEXT:    s_and_b64 vcc, s[8:9], s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v36, v35, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
 ; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
+; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
 ; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
 ; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
 ; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
@@ -9617,46 +9019,34 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
 ; GFX900-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
 ; GFX900-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
-; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    s_waitcnt vmcnt(3)
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v33, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v32
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX900-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v33, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX900-NEXT:    v_cmp_gt_f32_e64 s[4:5], v34, v36
+; GFX900-NEXT:    v_cndmask_b32_e64 v34, v35, v33, s[4:5]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v33
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v35
+; GFX900-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GFX900-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v50, v37, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v34, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v37, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v37
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v37, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
@@ -9666,15 +9056,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v35
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
@@ -9684,34 +9072,29 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v36
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v37
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
@@ -9721,33 +9104,30 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v38
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v39
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
@@ -9757,15 +9137,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v48
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
@@ -9775,15 +9153,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v49
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
@@ -9793,15 +9169,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v50
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
@@ -9811,15 +9185,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v51
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
@@ -9829,15 +9201,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v52
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
@@ -9847,15 +9217,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
-; GFX900-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v53
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
@@ -9865,15 +9233,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
-; GFX900-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v54
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -9882,15 +9248,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT:    v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v15
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
@@ -9899,15 +9263,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v30
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v14
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
@@ -9916,15 +9278,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v55, v30
-; GFX900-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v29
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v13
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
@@ -9933,15 +9293,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v30, v29
-; GFX900-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v12
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
@@ -9950,15 +9308,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v29, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v27
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v11
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
@@ -9967,15 +9323,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v28, v27
-; GFX900-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v10
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
@@ -9984,52 +9338,46 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v27, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v9
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
-; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v8
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
@@ -10038,15 +9386,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
@@ -10055,15 +9401,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
@@ -10072,15 +9416,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
@@ -10089,15 +9431,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
@@ -10106,15 +9446,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
@@ -10123,15 +9461,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
@@ -10140,14 +9476,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v54, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v53, v1, s4
 ; GFX900-NEXT:    v_perm_b32 v2, v52, v2, s4
@@ -10161,9 +9496,9 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_perm_b32 v10, v36, v10, s4
 ; GFX900-NEXT:    v_perm_b32 v11, v35, v11, s4
 ; GFX900-NEXT:    v_perm_b32 v12, v34, v12, s4
-; GFX900-NEXT:    v_perm_b32 v13, v33, v13, s4
+; GFX900-NEXT:    v_perm_b32 v13, v32, v13, s4
 ; GFX900-NEXT:    v_perm_b32 v14, v31, v14, s4
-; GFX900-NEXT:    v_perm_b32 v15, v32, v15, s4
+; GFX900-NEXT:    v_perm_b32 v15, v33, v15, s4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14282,14 +13617,12 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14304,14 +13637,12 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14324,22 +13655,17 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14354,14 +13680,12 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14405,17 +13729,15 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14525,15 +13847,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14542,15 +13862,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14567,15 +13885,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14584,14 +13900,12 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -14606,45 +13920,35 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14662,6 +13966,7 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
@@ -14669,24 +13974,19 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v2bf16_no_ieee:
@@ -14745,40 +14045,37 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v2bf16_no_ieee:
@@ -14942,15 +14239,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -14959,15 +14254,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -14976,14 +14269,12 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -15001,15 +14292,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15018,15 +14307,13 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -15035,14 +14322,12 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -15057,68 +14342,53 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15126,58 +14396,52 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v0, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v9, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -15253,59 +14517,56 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -15519,15 +14780,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -15538,15 +14797,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15555,15 +14812,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -15572,14 +14827,12 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -15599,15 +14852,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -15618,15 +14869,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15635,15 +14884,13 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -15652,14 +14899,12 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v4, v1, s4
@@ -15675,93 +14920,73 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4bf16_no_ieee:
@@ -15770,75 +14995,67 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_sdwa v11, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v10, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v7, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v8, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v8
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v9
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15932,80 +15149,78 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
index 77c45d20296b4..ecf06f3c2f379 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -33,15 +33,13 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimumnum_bf16:
@@ -56,15 +54,13 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimumnum_bf16:
@@ -77,22 +73,17 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_bf16:
@@ -107,14 +98,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_bf16:
@@ -158,17 +147,15 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_bf16:
@@ -265,53 +252,44 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimumnum_bf16_nnan:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_lt_f32_e64 s[4:5], v3, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v2
+; GFX900-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimumnum_bf16_nnan:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_movk_i32 s0, 0x8000
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_cmp_lt_f32_e64 s[0:1], v3, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[0:1]
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[0:1], 0, v2
+; GFX950-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_bf16_nnan:
@@ -320,14 +298,12 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_bf16_nnan:
@@ -357,15 +333,13 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_bf16_nnan:
@@ -457,16 +431,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -475,15 +447,13 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -500,16 +470,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -518,14 +486,12 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -540,46 +506,36 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -597,6 +553,7 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
@@ -604,24 +561,19 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v2bf16:
@@ -680,40 +632,37 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v2bf16:
@@ -864,28 +813,23 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v3, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_sdwa v1, v1, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -895,28 +839,23 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v1, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -927,66 +866,49 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
+; GFX950-NEXT:    v_cndmask_b32_sdwa v1, v1, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2bf16_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v5, v4
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v1, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v5
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1179,16 +1101,14 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1197,15 +1117,13 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -1214,14 +1132,12 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1239,16 +1155,14 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1257,15 +1171,13 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -1274,14 +1186,12 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1296,69 +1206,54 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1366,58 +1261,52 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v0, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v9, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v3bf16:
@@ -1493,59 +1382,56 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16:
@@ -1746,39 +1632,33 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1789,39 +1669,33 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1832,93 +1706,71 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3bf16_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v3bf16_nnan:
@@ -2167,16 +2019,14 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -2187,15 +2037,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2204,15 +2052,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -2221,14 +2067,12 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -2248,16 +2092,14 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -2268,15 +2110,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2285,15 +2125,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -2302,14 +2140,12 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v4, v1, s4
@@ -2325,94 +2161,74 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v5
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4bf16:
@@ -2421,75 +2237,67 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_sdwa v11, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v10, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v7, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v8, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v8
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v9
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2583,80 +2391,78 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2908,53 +2714,44 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -2967,52 +2764,44 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v4, s4
@@ -3024,68 +2813,49 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v1, v4, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
+; GFX950-NEXT:    v_perm_b32 v1, v1, v4, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4bf16_nnan:
@@ -3093,53 +2863,45 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v9, v8
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v12, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v13
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v4
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v13, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3457,16 +3219,14 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -3477,15 +3237,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v7
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
@@ -3496,15 +3254,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v8
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3513,15 +3269,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -3530,15 +3284,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -3547,14 +3299,12 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
@@ -3577,16 +3327,14 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v6
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -3597,15 +3345,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v7
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
@@ -3616,15 +3362,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v8
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3633,15 +3377,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -3650,15 +3392,13 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -3667,14 +3407,12 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v8, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v7, v1, s4
@@ -3689,146 +3427,115 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v6
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v7
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v8
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v8, v0, s0
 ; GFX950-NEXT:    v_perm_b32 v1, v7, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-NEXT:    v_perm_b32 v2, v6, v2, s0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_perm_b32 v0, v8, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v6bf16:
@@ -3836,113 +3543,101 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_sdwa v12, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_sdwa v14, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v10, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v15, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v10, v12, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v14, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v8, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v10, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v11, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v9, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v9, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v11, v11
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v10, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v13, v11
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v15, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v10
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v11
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
 ; GFX10-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v2, v7, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4074,114 +3769,113 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v12, 16, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v15, v14 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v14, v10 :: v_dual_lshlrev_b32 v15, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v12, v10, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v9, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v13, v12
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v15, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v10, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4557,16 +4251,14 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v8
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -4577,15 +4269,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v9
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
@@ -4596,15 +4286,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v10
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -4615,15 +4303,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v11
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4632,15 +4318,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
@@ -4649,15 +4333,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -4666,15 +4348,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
@@ -4683,14 +4363,12 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
@@ -4715,16 +4393,14 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v8
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -4735,15 +4411,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v9
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
@@ -4754,15 +4428,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v10
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -4773,15 +4445,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v11
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4790,15 +4460,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
@@ -4807,15 +4475,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -4824,15 +4490,13 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
@@ -4841,14 +4505,12 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v11, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v10, v1, s4
@@ -4864,194 +4526,152 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v8
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
-; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
+; GFX950-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v9
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
-; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v10
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v11
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v11, v0, s0
 ; GFX950-NEXT:    v_perm_b32 v1, v10, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX950-NEXT:    v_perm_b32 v2, v9, v2, s0
 ; GFX950-NEXT:    v_perm_b32 v3, v8, v3, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_perm_b32 v0, v11, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v8bf16:
@@ -5060,151 +4680,135 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v12, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v17, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v18, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v17, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v13
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX10-NEXT:    s_and_b32 s5, s7, s8
 ; GFX10-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s5
+; GFX10-NEXT:    s_and_b32 s5, s9, s10
 ; GFX10-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s5
+; GFX10-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v8bf16:
@@ -6033,16 +5637,14 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v19
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v17
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
@@ -6053,15 +5655,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v18
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
@@ -6072,15 +5672,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v19
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
@@ -6091,15 +5689,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v20
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
@@ -6110,15 +5706,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v21
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
@@ -6129,15 +5723,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v22
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
@@ -6148,15 +5740,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v23
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
@@ -6167,15 +5757,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v24
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6184,15 +5772,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v7
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
@@ -6201,15 +5787,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
@@ -6218,15 +5802,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
@@ -6235,15 +5817,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
@@ -6252,15 +5832,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
@@ -6269,15 +5847,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
@@ -6286,15 +5862,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -6303,14 +5877,12 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v22
@@ -6343,16 +5915,14 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v19
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v17
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
@@ -6363,15 +5933,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v18
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
@@ -6382,15 +5950,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v19
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
@@ -6401,15 +5967,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v20
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
@@ -6420,15 +5984,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v21
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
@@ -6439,15 +6001,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v22
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
@@ -6458,15 +6018,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v23
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
@@ -6477,15 +6035,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v24
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6494,15 +6050,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v7
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
@@ -6511,15 +6065,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v6
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
@@ -6528,15 +6080,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
@@ -6545,15 +6095,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v4
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
@@ -6562,15 +6110,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
@@ -6579,15 +6125,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
@@ -6596,15 +6140,13 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -6613,14 +6155,12 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v23, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v22, v1, s4
@@ -6640,383 +6180,297 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
-; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v16
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v19
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v16
-; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v17
-; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
-; GFX950-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX950-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX950-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v17
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
-; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v18
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
-; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v19
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
-; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v20
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
-; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v22, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v21
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
-; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v23, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v22
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
-; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v24, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v23
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v23, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v7
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v15
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
-; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v6
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v14
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v4
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v13
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v11
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v10
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v23, v0, s0
 ; GFX950-NEXT:    v_perm_b32 v1, v22, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
 ; GFX950-NEXT:    v_perm_b32 v2, v21, v2, s0
 ; GFX950-NEXT:    v_perm_b32 v3, v20, v3, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX950-NEXT:    v_perm_b32 v0, v23, v0, s0
 ; GFX950-NEXT:    v_perm_b32 v4, v19, v4, s0
 ; GFX950-NEXT:    v_perm_b32 v5, v18, v5, s0
 ; GFX950-NEXT:    v_perm_b32 v6, v17, v6, s0
@@ -8982,50 +8436,49 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v48, v48
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
+; GFX8-NEXT:    v_cndmask_b32_e64 v38, v38, v35, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v32
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v39
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[6:7], v37, v39
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v32, v31, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[6:7], v36, v48
+; GFX8-NEXT:    s_movk_i32 s10, 0x8000
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v38, v35, s[6:7]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v32
+; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s10, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[6:7], 0, v37
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v35
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[8:9], 0, v38
+; GFX8-NEXT:    s_and_b64 vcc, s[6:7], vcc
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX8-NEXT:    s_and_b64 vcc, s[8:9], s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v36, v35, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
 ; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
@@ -9037,43 +8490,34 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v33, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v32
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v33, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v34, v36
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v35, v33, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s10, v33
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v35
+; GFX8-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v51, v50, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v50, v34, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v35, v37
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v34, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v34
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
@@ -9083,15 +8527,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v35
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
@@ -9101,88 +8543,80 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v36
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v37
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v38
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v39
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v48
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
@@ -9192,15 +8626,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v49
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
@@ -9210,15 +8642,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v50
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
@@ -9228,15 +8658,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v51
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
@@ -9246,15 +8674,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v52
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
@@ -9264,15 +8690,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
-; GFX8-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v53
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
@@ -9282,15 +8706,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
-; GFX8-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v54
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -9299,15 +8721,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT:    v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v15
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
@@ -9316,15 +8736,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v14
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
@@ -9333,15 +8751,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v55, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v13
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
@@ -9350,15 +8766,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v30, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v12
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
@@ -9367,15 +8781,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v29, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v11
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
@@ -9384,15 +8796,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v28, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v10
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
@@ -9401,15 +8811,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v27, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v9
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
@@ -9418,15 +8826,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v8
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
@@ -9435,18 +8841,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v7
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
@@ -9455,15 +8856,16 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v6
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
@@ -9472,15 +8874,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
@@ -9489,15 +8889,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v4
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
@@ -9506,15 +8904,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
@@ -9523,15 +8919,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v2
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
@@ -9540,15 +8934,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
@@ -9557,14 +8949,12 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v54
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v53
@@ -9591,11 +8981,11 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v34
 ; GFX8-NEXT:    v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -9605,50 +8995,49 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX900-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX900-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v48, v48
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
+; GFX900-NEXT:    v_cndmask_b32_e64 v38, v38, v35, s[4:5]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v32
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v39
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX900-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX900-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
+; GFX900-NEXT:    v_cmp_lt_f32_e64 s[6:7], v37, v39
+; GFX900-NEXT:    v_cndmask_b32_e64 v32, v32, v31, s[6:7]
+; GFX900-NEXT:    v_cmp_lt_f32_e64 s[6:7], v36, v48
+; GFX900-NEXT:    s_movk_i32 s10, 0x8000
+; GFX900-NEXT:    v_cndmask_b32_e64 v36, v38, v35, s[6:7]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v32
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s10, v31
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[6:7], 0, v37
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v35
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[8:9], 0, v38
+; GFX900-NEXT:    s_and_b64 vcc, s[6:7], vcc
+; GFX900-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX900-NEXT:    s_and_b64 vcc, s[8:9], s[4:5]
+; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v36, v35, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
 ; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
 ; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
-; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
+; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
 ; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
 ; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
@@ -9660,43 +9049,34 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
 ; GFX900-NEXT:    s_waitcnt vmcnt(3)
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v33, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v32
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX900-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v33, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX900-NEXT:    v_cmp_lt_f32_e64 s[4:5], v34, v36
+; GFX900-NEXT:    v_cndmask_b32_e64 v34, v35, v33, s[4:5]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s10, v33
+; GFX900-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v35
+; GFX900-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v51, v50, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v50, v34, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v35, v37
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v34, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v34
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
@@ -9706,15 +9086,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v35
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
@@ -9724,88 +9102,80 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v36
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v37
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v38
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v39
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v48
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
@@ -9815,15 +9185,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v49
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
@@ -9833,15 +9201,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v50
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
@@ -9851,15 +9217,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v51
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
@@ -9869,15 +9233,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v52
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
@@ -9887,15 +9249,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
-; GFX900-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v53
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
@@ -9905,15 +9265,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
-; GFX900-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v54
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -9922,15 +9280,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT:    v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v15
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
@@ -9939,15 +9295,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v30
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v14
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
@@ -9956,15 +9310,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v55, v30
-; GFX900-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v29
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v13
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
@@ -9973,15 +9325,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v30, v29
-; GFX900-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v12
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
@@ -9990,15 +9340,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v29, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v27
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v11
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
@@ -10007,15 +9355,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v28, v27
-; GFX900-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v10
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
@@ -10024,52 +9370,46 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v27, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v9
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
-; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v8
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v7
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
@@ -10078,15 +9418,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v6
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
@@ -10095,15 +9433,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
@@ -10112,15 +9448,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v4
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
@@ -10129,15 +9463,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
@@ -10146,15 +9478,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v2
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
@@ -10163,15 +9493,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
@@ -10180,14 +9508,12 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s10, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v54, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v53, v1, s4
@@ -10202,9 +9528,9 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_perm_b32 v10, v36, v10, s4
 ; GFX900-NEXT:    v_perm_b32 v11, v35, v11, s4
 ; GFX900-NEXT:    v_perm_b32 v12, v34, v12, s4
-; GFX900-NEXT:    v_perm_b32 v13, v33, v13, s4
+; GFX900-NEXT:    v_perm_b32 v13, v32, v13, s4
 ; GFX900-NEXT:    v_perm_b32 v14, v31, v14, s4
-; GFX900-NEXT:    v_perm_b32 v15, v32, v15, s4
+; GFX900-NEXT:    v_perm_b32 v15, v33, v15, s4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10213,771 +9539,604 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v50, off, s32
 ; GFX950-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX950-NEXT:    v_cmp_u_f32_e64 s[0:1], v48, v48
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX950-NEXT:    v_cndmask_b32_e64 v38, v38, v35, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v32
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX950-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v39
-; GFX950-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v31
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v22
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
+; GFX950-NEXT:    v_cmp_lt_f32_e64 s[2:3], v37, v39
+; GFX950-NEXT:    s_movk_i32 s6, 0x8000
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s6, v31
+; GFX950-NEXT:    v_cndmask_b32_e64 v32, v32, v31, s[2:3]
+; GFX950-NEXT:    v_cmp_lt_f32_e64 s[2:3], v36, v48
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v32
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v35
+; GFX950-NEXT:    v_cndmask_b32_e64 v36, v38, v35, s[2:3]
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[2:3], 0, v37
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v38
+; GFX950-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; GFX950-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX950-NEXT:    s_and_b64 vcc, s[4:5], s[0:1]
+; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v36, v35, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
+; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v21
 ; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v20
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v34
 ; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v19
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v38
 ; GFX950-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
 ; GFX950-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
 ; GFX950-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
 ; GFX950-NEXT:    v_accvgpr_write_b32 a2, v42 ; Reload Reuse
 ; GFX950-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v50
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v50
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v33, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v32
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v50
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v33, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX950-NEXT:    v_cmp_lt_f32_e64 s[0:1], v34, v36
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s6, v33
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
+; GFX950-NEXT:    v_cndmask_b32_e64 v34, v35, v33, s[0:1]
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[0:1], 0, v35
+; GFX950-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX950-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v37
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v52, v51, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v34
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v51, v34, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v35, v37
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v34
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
-; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v34, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
+; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v22
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v35
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
-; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v36
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v37
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
-; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v38
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v39
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v51
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v6
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v49, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v48
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v49, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v51
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
-; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v21
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v51, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v49
 ; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v49, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v51, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v49, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v20
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
+; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v51
 ; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
-; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v19
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
+; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v52
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v55
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
-; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v54, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v53
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v55, v40
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v54, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
-; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v17
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v55, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v54
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v54, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v40, v55, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
-; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v54, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v55, v54, vcc
+; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v40, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v55
 ; GFX950-NEXT:    v_cndmask_b32_e32 v40, v40, v55, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
 ; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v41, v40, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v40
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v40, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v40, v40, v55, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v40, v55, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v15
 ; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v15, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v40
 ; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v40, v50, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v15
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v50
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v40
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v14
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v50, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v14
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v14
 ; GFX950-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v50
 ; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v50, v30, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v30
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v50
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v13
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v13
 ; GFX950-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v30
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v29
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
-; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v12
 ; GFX950-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v30, v29
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v28
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v11
 ; GFX950-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v29, v28
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v27
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v10
 ; GFX950-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v28, v27
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v26
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
+; GFX950-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v9
 ; GFX950-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v27, v26
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v25
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v8
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v25
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v7
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v6
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v23
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v22
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v4
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v21
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v20
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v19
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v18
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s6, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v17
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v16
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v55, v0, s0
 ; GFX950-NEXT:    v_perm_b32 v1, v54, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
 ; GFX950-NEXT:    v_perm_b32 v2, v53, v2, s0
 ; GFX950-NEXT:    v_perm_b32 v3, v52, v3, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX950-NEXT:    v_perm_b32 v0, v55, v0, s0
 ; GFX950-NEXT:    v_perm_b32 v4, v51, v4, s0
 ; GFX950-NEXT:    v_perm_b32 v5, v49, v5, s0
 ; GFX950-NEXT:    v_perm_b32 v6, v48, v6, s0
@@ -14337,15 +13496,13 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14360,15 +13517,13 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s4, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14381,22 +13536,17 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14411,14 +13561,12 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14462,17 +13610,15 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14582,16 +13728,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14600,15 +13744,13 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14625,16 +13767,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v3
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14643,14 +13783,12 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -14665,46 +13803,36 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14722,6 +13850,7 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
@@ -14729,24 +13858,19 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v2bf16_no_ieee:
@@ -14805,40 +13929,37 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v2bf16_no_ieee:
@@ -15002,16 +14123,14 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15020,15 +14139,13 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -15037,14 +14154,12 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -15062,16 +14177,14 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15080,15 +14193,13 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -15097,14 +14208,12 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -15119,69 +14228,54 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15189,58 +14283,52 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v0, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v9, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -15316,59 +14404,56 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -15582,16 +14667,14 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -15602,15 +14685,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15619,15 +14700,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -15636,14 +14715,12 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX8-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -15663,16 +14740,14 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT:    s_movk_i32 s6, 0x8000
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -15683,15 +14758,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v5
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15700,15 +14773,13 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v1
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -15717,14 +14788,12 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], s6, v0
+; GFX900-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v4, v1, s4
@@ -15740,94 +14809,74 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_movk_i32 s2, 0x8000
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v5
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], s2, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4bf16_no_ieee:
@@ -15836,75 +14885,67 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_sdwa v11, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v10, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v6
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v7, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v8, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v8
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v9
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15998,80 +15039,78 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll
index 7aaf00f871136..7d9f9d2fff000 100644
--- a/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll
+++ b/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc --mtriple=mipsisa32r6 < %s | FileCheck %s --check-prefix=MIPS32R6
-; RUN: llc --mtriple=mips64 < %s | FileCheck %s --check-prefix=MIPS64R2
+; RUN: llc --mtriple=mips64 -mattr=+mips64r2 < %s | FileCheck %s --check-prefix=MIPS64R2
+; RUN: llc --mtriple=mips64 -mattr=+mips64 < %s | FileCheck %s --check-prefix=MIPS64
+; RUN: llc --mtriple=mips -mattr=+mips32r2 < %s | FileCheck %s --check-prefix=MIPS32R2
+; RUN: llc --mtriple=mips -mattr=+mips32 < %s | FileCheck %s --check-prefix=MIPS32
 
 declare float @llvm.maximumnum.f32(float, float)
 declare double @llvm.maximumnum.f64(double, double)
@@ -17,22 +20,87 @@ define float @maximumnum_float(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_float:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.s $f0, $f13
 ; MIPS64R2-NEXT:    c.un.s $f12, $f12
 ; MIPS64R2-NEXT:    movt.s $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.s $f13, $f13
-; MIPS64R2-NEXT:    movt.s $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.ule.s $f12, $f13
-; MIPS64R2-NEXT:    mov.s $f0, $f13
+; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64R2-NEXT:    c.ule.s $f12, $f0
 ; MIPS64R2-NEXT:    movf.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
 ; MIPS64R2-NEXT:    mov.s $f1, $f0
 ; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: maximumnum_float:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.s $f0, $f13
+; MIPS64-NEXT:    c.un.s $f12, $f12
+; MIPS64-NEXT:    movt.s $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.s $f13, $f13
+; MIPS64-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.ule.s $f12, $f0
+; MIPS64-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    mfc1 $1, $f12
+; MIPS64-NEXT:    mov.s $f1, $f0
+; MIPS64-NEXT:    movz.s $f1, $f12, $1
+; MIPS64-NEXT:    mtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.s $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: maximumnum_float:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.s $f0, $f14
+; MIPS32R2-NEXT:    c.un.s $f12, $f12
+; MIPS32R2-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.s $f14, $f14
+; MIPS32R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.ule.s $f12, $f0
+; MIPS32R2-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    mfc1 $1, $f12
+; MIPS32R2-NEXT:    mov.s $f1, $f0
+; MIPS32R2-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f2
+; MIPS32R2-NEXT:    c.eq.s $f0, $f2
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32-LABEL: maximumnum_float:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.s $f0, $f14
+; MIPS32-NEXT:    c.un.s $f12, $f12
+; MIPS32-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.s $f14, $f14
+; MIPS32-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.ule.s $f12, $f0
+; MIPS32-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    mfc1 $1, $f12
+; MIPS32-NEXT:    mov.s $f1, $f0
+; MIPS32-NEXT:    movz.s $f1, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f2
+; MIPS32-NEXT:    c.eq.s $f0, $f2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.s $f0, $f1, $fcc0
+; MIPS32R5-LABEL: maximumnum_float:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.s $f0, $f14
+; MIPS32R5-NEXT:    c.un.s $f12, $f12
+; MIPS32R5-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.s $f14, $f14
+; MIPS32R5-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.ule.s $f12, $f0
+; MIPS32R5-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    mfc1 $1, $f12
+; MIPS32R5-NEXT:    mov.s $f1, $f0
+; MIPS32R5-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.s $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.s $f0, $f1, $fcc0
   %z = call float @llvm.maximumnum.f32(float %x, float %y)
   ret float %z
 }
@@ -55,6 +123,49 @@ define float @maximumnum_float_nsz(float %x, float %y) {
 ; MIPS64R2-NEXT:    c.ule.s $f12, $f0
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movf.s $f0, $f12, $fcc0
+;
+; MIPS64-LABEL: maximumnum_float_nsz:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.s $f0, $f13
+; MIPS64-NEXT:    c.un.s $f12, $f12
+; MIPS64-NEXT:    movt.s $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.s $f13, $f13
+; MIPS64-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.ule.s $f12, $f0
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movf.s $f0, $f12, $fcc0
+;
+; MIPS32R2-LABEL: maximumnum_float_nsz:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.s $f0, $f14
+; MIPS32R2-NEXT:    c.un.s $f12, $f12
+; MIPS32R2-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.s $f14, $f14
+; MIPS32R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.ule.s $f12, $f0
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movf.s $f0, $f12, $fcc0
+;
+; MIPS32-LABEL: maximumnum_float_nsz:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.s $f0, $f14
+; MIPS32-NEXT:    c.un.s $f12, $f12
+; MIPS32-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.s $f14, $f14
+; MIPS32-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.ule.s $f12, $f0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS32R5-LABEL: maximumnum_float_nsz:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.s $f0, $f14
+; MIPS32R5-NEXT:    c.un.s $f12, $f12
+; MIPS32R5-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.s $f14, $f14
+; MIPS32R5-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.ule.s $f12, $f0
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movf.s $f0, $f12, $fcc0
   %z = call nsz float @llvm.maximumnum.f32(float %x, float %y)
   ret float %z
 }
@@ -67,18 +178,67 @@ define float @maximumnum_float_nnan(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_float_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.ule.s $f12, $f13
 ; MIPS64R2-NEXT:    mov.s $f0, $f13
+; MIPS64R2-NEXT:    c.ule.s $f12, $f13
 ; MIPS64R2-NEXT:    movf.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
 ; MIPS64R2-NEXT:    mov.s $f1, $f0
 ; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: maximumnum_float_nnan:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.s $f0, $f13
+; MIPS64-NEXT:    c.ule.s $f12, $f13
+; MIPS64-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    mfc1 $1, $f12
+; MIPS64-NEXT:    mov.s $f1, $f0
+; MIPS64-NEXT:    movz.s $f1, $f12, $1
+; MIPS64-NEXT:    mtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.s $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: maximumnum_float_nnan:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.s $f0, $f14
+; MIPS32R2-NEXT:    c.ule.s $f12, $f14
+; MIPS32R2-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    mfc1 $1, $f12
+; MIPS32R2-NEXT:    mov.s $f1, $f0
+; MIPS32R2-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f2
+; MIPS32R2-NEXT:    c.eq.s $f0, $f2
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32-LABEL: maximumnum_float_nnan:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.s $f0, $f14
+; MIPS32-NEXT:    c.ule.s $f12, $f14
+; MIPS32-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    mfc1 $1, $f12
+; MIPS32-NEXT:    mov.s $f1, $f0
+; MIPS32-NEXT:    movz.s $f1, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f2
+; MIPS32-NEXT:    c.eq.s $f0, $f2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.s $f0, $f1, $fcc0
+; MIPS32R5-LABEL: maximumnum_float_nnan:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.s $f0, $f14
+; MIPS32R5-NEXT:    c.ule.s $f12, $f14
+; MIPS32R5-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    mfc1 $1, $f12
+; MIPS32R5-NEXT:    mov.s $f1, $f0
+; MIPS32R5-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.s $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.s $f0, $f1, $fcc0
   %z = call nnan float @llvm.maximumnum.f32(float %x, float %y)
   ret float %z
 }
@@ -94,22 +254,93 @@ define double @maximumnum_double(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_double:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.d $f0, $f13
 ; MIPS64R2-NEXT:    c.un.d $f12, $f12
 ; MIPS64R2-NEXT:    movt.d $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.d $f13, $f13
-; MIPS64R2-NEXT:    movt.d $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.ule.d $f12, $f13
-; MIPS64R2-NEXT:    mov.d $f0, $f13
+; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64R2-NEXT:    c.ule.d $f12, $f0
 ; MIPS64R2-NEXT:    movf.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmfc1 $1, $f12
 ; MIPS64R2-NEXT:    mov.d $f1, $f0
 ; MIPS64R2-NEXT:    movz.d $f1, $f12, $1
-; MIPS64R2-NEXT:    dmfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: maximumnum_double:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.d $f0, $f13
+; MIPS64-NEXT:    c.un.d $f12, $f12
+; MIPS64-NEXT:    movt.d $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.d $f13, $f13
+; MIPS64-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.ule.d $f12, $f0
+; MIPS64-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    dmfc1 $1, $f12
+; MIPS64-NEXT:    mov.d $f1, $f0
+; MIPS64-NEXT:    movz.d $f1, $f12, $1
+; MIPS64-NEXT:    dmtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.d $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: maximumnum_double:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.d $f0, $f14
+; MIPS32R2-NEXT:    c.un.d $f12, $f12
+; MIPS32R2-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.d $f14, $f14
+; MIPS32R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.ule.d $f12, $f0
+; MIPS32R2-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    cvt.s.d $f2, $f12
+; MIPS32R2-NEXT:    mfc1 $1, $f2
+; MIPS32R2-NEXT:    mov.d $f2, $f0
+; MIPS32R2-NEXT:    movz.d $f2, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f4
+; MIPS32R2-NEXT:    mthc1 $zero, $f4
+; MIPS32R2-NEXT:    c.eq.d $f0, $f4
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.d $f0, $f2, $fcc0
+;
+; MIPS32-LABEL: maximumnum_double:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.d $f0, $f14
+; MIPS32-NEXT:    c.un.d $f12, $f12
+; MIPS32-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.d $f14, $f14
+; MIPS32-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.ule.d $f12, $f0
+; MIPS32-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    cvt.s.d $f2, $f12
+; MIPS32-NEXT:    mfc1 $1, $f2
+; MIPS32-NEXT:    mov.d $f2, $f0
+; MIPS32-NEXT:    movz.d $f2, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f4
+; MIPS32-NEXT:    mtc1 $zero, $f5
+; MIPS32-NEXT:    c.eq.d $f0, $f4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.d $f0, $f2, $fcc0
+; MIPS32R5-LABEL: maximumnum_double:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.d $f0, $f14
+; MIPS32R5-NEXT:    c.un.d $f12, $f12
+; MIPS32R5-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.d $f14, $f14
+; MIPS32R5-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.ule.d $f12, $f0
+; MIPS32R5-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    cvt.s.d $f1, $f12
+; MIPS32R5-NEXT:    mfc1 $1, $f1
+; MIPS32R5-NEXT:    mov.d $f1, $f0
+; MIPS32R5-NEXT:    movz.d $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    mthc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.d $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.d $f0, $f1, $fcc0
   %z = call double @llvm.maximumnum.f64(double %x, double %y)
   ret double %z
 }
@@ -132,6 +363,49 @@ define double @maximumnum_double_nsz(double %x, double %y) {
 ; MIPS64R2-NEXT:    c.ule.d $f12, $f0
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movf.d $f0, $f12, $fcc0
+;
+; MIPS64-LABEL: maximumnum_double_nsz:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.d $f0, $f13
+; MIPS64-NEXT:    c.un.d $f12, $f12
+; MIPS64-NEXT:    movt.d $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.d $f13, $f13
+; MIPS64-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.ule.d $f12, $f0
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movf.d $f0, $f12, $fcc0
+;
+; MIPS32R2-LABEL: maximumnum_double_nsz:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.d $f0, $f14
+; MIPS32R2-NEXT:    c.un.d $f12, $f12
+; MIPS32R2-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.d $f14, $f14
+; MIPS32R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.ule.d $f12, $f0
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movf.d $f0, $f12, $fcc0
+;
+; MIPS32-LABEL: maximumnum_double_nsz:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.d $f0, $f14
+; MIPS32-NEXT:    c.un.d $f12, $f12
+; MIPS32-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.d $f14, $f14
+; MIPS32-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.ule.d $f12, $f0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS32R5-LABEL: maximumnum_double_nsz:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.d $f0, $f14
+; MIPS32R5-NEXT:    c.un.d $f12, $f12
+; MIPS32R5-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.d $f14, $f14
+; MIPS32R5-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.ule.d $f12, $f0
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movf.d $f0, $f12, $fcc0
   %z = call nsz double @llvm.maximumnum.f64(double %x, double %y)
   ret double %z
 }
@@ -144,18 +418,73 @@ define double @maximumnum_double_nnan(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_double_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.ule.d $f12, $f13
 ; MIPS64R2-NEXT:    mov.d $f0, $f13
+; MIPS64R2-NEXT:    c.ule.d $f12, $f13
 ; MIPS64R2-NEXT:    movf.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmfc1 $1, $f12
 ; MIPS64R2-NEXT:    mov.d $f1, $f0
 ; MIPS64R2-NEXT:    movz.d $f1, $f12, $1
-; MIPS64R2-NEXT:    dmfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: maximumnum_double_nnan:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.d $f0, $f13
+; MIPS64-NEXT:    c.ule.d $f12, $f13
+; MIPS64-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    dmfc1 $1, $f12
+; MIPS64-NEXT:    mov.d $f1, $f0
+; MIPS64-NEXT:    movz.d $f1, $f12, $1
+; MIPS64-NEXT:    dmtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.d $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: maximumnum_double_nnan:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.d $f0, $f14
+; MIPS32R2-NEXT:    c.ule.d $f12, $f14
+; MIPS32R2-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    cvt.s.d $f2, $f12
+; MIPS32R2-NEXT:    mfc1 $1, $f2
+; MIPS32R2-NEXT:    mov.d $f2, $f0
+; MIPS32R2-NEXT:    movz.d $f2, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f4
+; MIPS32R2-NEXT:    mthc1 $zero, $f4
+; MIPS32R2-NEXT:    c.eq.d $f0, $f4
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.d $f0, $f2, $fcc0
+;
+; MIPS32-LABEL: maximumnum_double_nnan:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.d $f0, $f14
+; MIPS32-NEXT:    c.ule.d $f12, $f14
+; MIPS32-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    cvt.s.d $f2, $f12
+; MIPS32-NEXT:    mfc1 $1, $f2
+; MIPS32-NEXT:    mov.d $f2, $f0
+; MIPS32-NEXT:    movz.d $f2, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f4
+; MIPS32-NEXT:    mtc1 $zero, $f5
+; MIPS32-NEXT:    c.eq.d $f0, $f4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.d $f0, $f2, $fcc0
+; MIPS32R5-LABEL: maximumnum_double_nnan:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.d $f0, $f14
+; MIPS32R5-NEXT:    c.ule.d $f12, $f14
+; MIPS32R5-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    cvt.s.d $f1, $f12
+; MIPS32R5-NEXT:    mfc1 $1, $f1
+; MIPS32R5-NEXT:    mov.d $f1, $f0
+; MIPS32R5-NEXT:    movz.d $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    mthc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.d $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.d $f0, $f1, $fcc0
   %z = call nnan double @llvm.maximumnum.f64(double %x, double %y)
   ret double %z
 }
@@ -170,25 +499,97 @@ define float @minimumnum_float(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_float:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.s $f0, $f13
 ; MIPS64R2-NEXT:    c.un.s $f12, $f12
 ; MIPS64R2-NEXT:    movt.s $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.s $f13, $f13
-; MIPS64R2-NEXT:    movt.s $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.olt.s $f12, $f13
-; MIPS64R2-NEXT:    mov.s $f0, $f13
+; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64R2-NEXT:    c.olt.s $f12, $f0
 ; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
 ; MIPS64R2-NEXT:    lui $2, 32768
 ; MIPS64R2-NEXT:    xor $1, $1, $2
 ; MIPS64R2-NEXT:    mov.s $f1, $f0
 ; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    xor $1, $1, $2
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: minimumnum_float:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.s $f0, $f13
+; MIPS64-NEXT:    c.un.s $f12, $f12
+; MIPS64-NEXT:    movt.s $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.s $f13, $f13
+; MIPS64-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.olt.s $f12, $f0
+; MIPS64-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    mfc1 $1, $f12
+; MIPS64-NEXT:    lui $2, 32768
+; MIPS64-NEXT:    xor $1, $1, $2
+; MIPS64-NEXT:    mov.s $f1, $f0
+; MIPS64-NEXT:    movz.s $f1, $f12, $1
+; MIPS64-NEXT:    mtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.s $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: minimumnum_float:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.s $f0, $f14
+; MIPS32R2-NEXT:    c.un.s $f12, $f12
+; MIPS32R2-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.s $f14, $f14
+; MIPS32R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.olt.s $f12, $f0
+; MIPS32R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    mfc1 $1, $f12
+; MIPS32R2-NEXT:    lui $2, 32768
+; MIPS32R2-NEXT:    xor $1, $1, $2
+; MIPS32R2-NEXT:    mov.s $f1, $f0
+; MIPS32R2-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f2
+; MIPS32R2-NEXT:    c.eq.s $f0, $f2
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32-LABEL: minimumnum_float:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.s $f0, $f14
+; MIPS32-NEXT:    c.un.s $f12, $f12
+; MIPS32-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.s $f14, $f14
+; MIPS32-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.olt.s $f12, $f0
+; MIPS32-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    mfc1 $1, $f12
+; MIPS32-NEXT:    lui $2, 32768
+; MIPS32-NEXT:    xor $1, $1, $2
+; MIPS32-NEXT:    mov.s $f1, $f0
+; MIPS32-NEXT:    movz.s $f1, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f2
+; MIPS32-NEXT:    c.eq.s $f0, $f2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.s $f0, $f1, $fcc0
+; MIPS32R5-LABEL: minimumnum_float:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.s $f0, $f14
+; MIPS32R5-NEXT:    c.un.s $f12, $f12
+; MIPS32R5-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.s $f14, $f14
+; MIPS32R5-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.olt.s $f12, $f0
+; MIPS32R5-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    mfc1 $1, $f12
+; MIPS32R5-NEXT:    lui $2, 32768
+; MIPS32R5-NEXT:    xor $1, $1, $2
+; MIPS32R5-NEXT:    mov.s $f1, $f0
+; MIPS32R5-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.s $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.s $f0, $f1, $fcc0
   %z = call float @llvm.minimumnum.f32(float %x, float %y)
   ret float %z
 }
@@ -211,6 +612,49 @@ define float @minimumnum_float_nsz(float %x, float %y) {
 ; MIPS64R2-NEXT:    c.olt.s $f12, $f0
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
+;
+; MIPS64-LABEL: minimumnum_float_nsz:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.s $f0, $f13
+; MIPS64-NEXT:    c.un.s $f12, $f12
+; MIPS64-NEXT:    movt.s $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.s $f13, $f13
+; MIPS64-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.olt.s $f12, $f0
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.s $f0, $f12, $fcc0
+;
+; MIPS32R2-LABEL: minimumnum_float_nsz:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.s $f0, $f14
+; MIPS32R2-NEXT:    c.un.s $f12, $f12
+; MIPS32R2-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.s $f14, $f14
+; MIPS32R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.olt.s $f12, $f0
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.s $f0, $f12, $fcc0
+;
+; MIPS32-LABEL: minimumnum_float_nsz:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.s $f0, $f14
+; MIPS32-NEXT:    c.un.s $f12, $f12
+; MIPS32-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.s $f14, $f14
+; MIPS32-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.olt.s $f12, $f0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R5-LABEL: minimumnum_float_nsz:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.s $f0, $f14
+; MIPS32R5-NEXT:    c.un.s $f12, $f12
+; MIPS32R5-NEXT:    movt.s $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.s $f14, $f14
+; MIPS32R5-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.olt.s $f12, $f0
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.s $f0, $f12, $fcc0
   %z = call nsz float @llvm.minimumnum.f32(float %x, float %y)
   ret float %z
 }
@@ -223,21 +667,77 @@ define float @minimumnum_float_nnan(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_float_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.olt.s $f12, $f13
 ; MIPS64R2-NEXT:    mov.s $f0, $f13
+; MIPS64R2-NEXT:    c.olt.s $f12, $f13
 ; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
 ; MIPS64R2-NEXT:    lui $2, 32768
 ; MIPS64R2-NEXT:    xor $1, $1, $2
 ; MIPS64R2-NEXT:    mov.s $f1, $f0
 ; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    xor $1, $1, $2
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: minimumnum_float_nnan:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.s $f0, $f13
+; MIPS64-NEXT:    c.olt.s $f12, $f13
+; MIPS64-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS64-NEXT:    mfc1 $1, $f12
+; MIPS64-NEXT:    lui $2, 32768
+; MIPS64-NEXT:    xor $1, $1, $2
+; MIPS64-NEXT:    mov.s $f1, $f0
+; MIPS64-NEXT:    movz.s $f1, $f12, $1
+; MIPS64-NEXT:    mtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.s $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: minimumnum_float_nnan:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.s $f0, $f14
+; MIPS32R2-NEXT:    c.olt.s $f12, $f14
+; MIPS32R2-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    mfc1 $1, $f12
+; MIPS32R2-NEXT:    lui $2, 32768
+; MIPS32R2-NEXT:    xor $1, $1, $2
+; MIPS32R2-NEXT:    mov.s $f1, $f0
+; MIPS32R2-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f2
+; MIPS32R2-NEXT:    c.eq.s $f0, $f2
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.s $f0, $f1, $fcc0
+;
+; MIPS32-LABEL: minimumnum_float_nnan:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.s $f0, $f14
+; MIPS32-NEXT:    c.olt.s $f12, $f14
+; MIPS32-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32-NEXT:    mfc1 $1, $f12
+; MIPS32-NEXT:    lui $2, 32768
+; MIPS32-NEXT:    xor $1, $1, $2
+; MIPS32-NEXT:    mov.s $f1, $f0
+; MIPS32-NEXT:    movz.s $f1, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f2
+; MIPS32-NEXT:    c.eq.s $f0, $f2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.s $f0, $f1, $fcc0
+; MIPS32R5-LABEL: minimumnum_float_nnan:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.s $f0, $f14
+; MIPS32R5-NEXT:    c.olt.s $f12, $f14
+; MIPS32R5-NEXT:    movt.s $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    mfc1 $1, $f12
+; MIPS32R5-NEXT:    lui $2, 32768
+; MIPS32R5-NEXT:    xor $1, $1, $2
+; MIPS32R5-NEXT:    mov.s $f1, $f0
+; MIPS32R5-NEXT:    movz.s $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.s $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.s $f0, $f1, $fcc0
   %z = call nnan float @llvm.minimumnum.f32(float %x, float %y)
   ret float %z
 }
@@ -252,12 +752,12 @@ define double @minimumnum_double(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_double:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.d $f0, $f13
 ; MIPS64R2-NEXT:    c.un.d $f12, $f12
 ; MIPS64R2-NEXT:    movt.d $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.d $f13, $f13
-; MIPS64R2-NEXT:    movt.d $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.olt.d $f12, $f13
-; MIPS64R2-NEXT:    mov.d $f0, $f13
+; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64R2-NEXT:    c.olt.d $f12, $f0
 ; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmfc1 $1, $f12
 ; MIPS64R2-NEXT:    daddiu $2, $zero, 1
@@ -265,13 +765,92 @@ define double @minimumnum_double(double %x, double %y) {
 ; MIPS64R2-NEXT:    xor $1, $1, $2
 ; MIPS64R2-NEXT:    mov.d $f1, $f0
 ; MIPS64R2-NEXT:    movz.d $f1, $f12, $1
-; MIPS64R2-NEXT:    dmfc1 $1, $f13
-; MIPS64R2-NEXT:    xor $1, $1, $2
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: minimumnum_double:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.d $f0, $f13
+; MIPS64-NEXT:    c.un.d $f12, $f12
+; MIPS64-NEXT:    movt.d $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.d $f13, $f13
+; MIPS64-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.olt.d $f12, $f0
+; MIPS64-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    dmfc1 $1, $f12
+; MIPS64-NEXT:    daddiu $2, $zero, 1
+; MIPS64-NEXT:    dsll $2, $2, 63
+; MIPS64-NEXT:    xor $1, $1, $2
+; MIPS64-NEXT:    mov.d $f1, $f0
+; MIPS64-NEXT:    movz.d $f1, $f12, $1
+; MIPS64-NEXT:    dmtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.d $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: minimumnum_double:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.d $f0, $f14
+; MIPS32R2-NEXT:    c.un.d $f12, $f12
+; MIPS32R2-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.d $f14, $f14
+; MIPS32R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.olt.d $f12, $f0
+; MIPS32R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    cvt.s.d $f2, $f12
+; MIPS32R2-NEXT:    mfc1 $1, $f2
+; MIPS32R2-NEXT:    lui $2, 32768
+; MIPS32R2-NEXT:    xor $1, $1, $2
+; MIPS32R2-NEXT:    mov.d $f2, $f0
+; MIPS32R2-NEXT:    movz.d $f2, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f4
+; MIPS32R2-NEXT:    mthc1 $zero, $f4
+; MIPS32R2-NEXT:    c.eq.d $f0, $f4
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.d $f0, $f2, $fcc0
+;
+; MIPS32-LABEL: minimumnum_double:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.d $f0, $f14
+; MIPS32-NEXT:    c.un.d $f12, $f12
+; MIPS32-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.d $f14, $f14
+; MIPS32-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.olt.d $f12, $f0
+; MIPS32-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    cvt.s.d $f2, $f12
+; MIPS32-NEXT:    mfc1 $1, $f2
+; MIPS32-NEXT:    lui $2, 32768
+; MIPS32-NEXT:    xor $1, $1, $2
+; MIPS32-NEXT:    mov.d $f2, $f0
+; MIPS32-NEXT:    movz.d $f2, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f4
+; MIPS32-NEXT:    mtc1 $zero, $f5
+; MIPS32-NEXT:    c.eq.d $f0, $f4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.d $f0, $f2, $fcc0
+; MIPS32R5-LABEL: minimumnum_double:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.d $f0, $f14
+; MIPS32R5-NEXT:    c.un.d $f12, $f12
+; MIPS32R5-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.d $f14, $f14
+; MIPS32R5-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.olt.d $f12, $f0
+; MIPS32R5-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    cvt.s.d $f1, $f12
+; MIPS32R5-NEXT:    mfc1 $1, $f1
+; MIPS32R5-NEXT:    lui $2, 32768
+; MIPS32R5-NEXT:    xor $1, $1, $2
+; MIPS32R5-NEXT:    mov.d $f1, $f0
+; MIPS32R5-NEXT:    movz.d $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    mthc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.d $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.d $f0, $f1, $fcc0
   %z = call double @llvm.minimumnum.f64(double %x, double %y)
   ret double %z
 }
@@ -294,6 +873,49 @@ define double @minimumnum_double_nsz(double %x, double %y) {
 ; MIPS64R2-NEXT:    c.olt.d $f12, $f0
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
+;
+; MIPS64-LABEL: minimumnum_double_nsz:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.d $f0, $f13
+; MIPS64-NEXT:    c.un.d $f12, $f12
+; MIPS64-NEXT:    movt.d $f12, $f13, $fcc0
+; MIPS64-NEXT:    c.un.d $f13, $f13
+; MIPS64-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    c.olt.d $f12, $f0
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.d $f0, $f12, $fcc0
+;
+; MIPS32R2-LABEL: minimumnum_double_nsz:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.d $f0, $f14
+; MIPS32R2-NEXT:    c.un.d $f12, $f12
+; MIPS32R2-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R2-NEXT:    c.un.d $f14, $f14
+; MIPS32R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    c.olt.d $f12, $f0
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.d $f0, $f12, $fcc0
+;
+; MIPS32-LABEL: minimumnum_double_nsz:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.d $f0, $f14
+; MIPS32-NEXT:    c.un.d $f12, $f12
+; MIPS32-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32-NEXT:    c.un.d $f14, $f14
+; MIPS32-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    c.olt.d $f12, $f0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R5-LABEL: minimumnum_double_nsz:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.d $f0, $f14
+; MIPS32R5-NEXT:    c.un.d $f12, $f12
+; MIPS32R5-NEXT:    movt.d $f12, $f14, $fcc0
+; MIPS32R5-NEXT:    c.un.d $f14, $f14
+; MIPS32R5-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    c.olt.d $f12, $f0
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.d $f0, $f12, $fcc0
   %z = call nsz double @llvm.minimumnum.f64(double %x, double %y)
   ret double %z
 }
@@ -306,22 +928,85 @@ define double @minimumnum_double_nnan(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_double_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.olt.d $f12, $f13
 ; MIPS64R2-NEXT:    mov.d $f0, $f13
+; MIPS64R2-NEXT:    c.olt.d $f12, $f13
 ; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    daddiu $1, $zero, 1
 ; MIPS64R2-NEXT:    dsll $1, $1, 63
 ; MIPS64R2-NEXT:    dmfc1 $2, $f12
-; MIPS64R2-NEXT:    xor $2, $2, $1
-; MIPS64R2-NEXT:    mov.d $f1, $f0
-; MIPS64R2-NEXT:    movz.d $f1, $f12, $2
-; MIPS64R2-NEXT:    dmfc1 $2, $f13
 ; MIPS64R2-NEXT:    xor $1, $2, $1
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
+; MIPS64R2-NEXT:    mov.d $f1, $f0
+; MIPS64R2-NEXT:    movz.d $f1, $f12, $1
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
 ; MIPS64R2-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS64-LABEL: minimumnum_double_nnan:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    mov.d $f0, $f13
+; MIPS64-NEXT:    c.olt.d $f12, $f13
+; MIPS64-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS64-NEXT:    daddiu $1, $zero, 1
+; MIPS64-NEXT:    dsll $1, $1, 63
+; MIPS64-NEXT:    dmfc1 $2, $f12
+; MIPS64-NEXT:    xor $1, $2, $1
+; MIPS64-NEXT:    mov.d $f1, $f0
+; MIPS64-NEXT:    movz.d $f1, $f12, $1
+; MIPS64-NEXT:    dmtc1 $zero, $f2
+; MIPS64-NEXT:    c.eq.d $f0, $f2
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    movt.d $f0, $f1, $fcc0
+;
+; MIPS32R2-LABEL: minimumnum_double_nnan:
+; MIPS32R2:       # %bb.0:
+; MIPS32R2-NEXT:    mov.d $f0, $f14
+; MIPS32R2-NEXT:    c.olt.d $f12, $f14
+; MIPS32R2-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R2-NEXT:    cvt.s.d $f2, $f12
+; MIPS32R2-NEXT:    mfc1 $1, $f2
+; MIPS32R2-NEXT:    lui $2, 32768
+; MIPS32R2-NEXT:    xor $1, $1, $2
+; MIPS32R2-NEXT:    mov.d $f2, $f0
+; MIPS32R2-NEXT:    movz.d $f2, $f12, $1
+; MIPS32R2-NEXT:    mtc1 $zero, $f4
+; MIPS32R2-NEXT:    mthc1 $zero, $f4
+; MIPS32R2-NEXT:    c.eq.d $f0, $f4
+; MIPS32R2-NEXT:    jr $ra
+; MIPS32R2-NEXT:    movt.d $f0, $f2, $fcc0
+;
+; MIPS32-LABEL: minimumnum_double_nnan:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    mov.d $f0, $f14
+; MIPS32-NEXT:    c.olt.d $f12, $f14
+; MIPS32-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32-NEXT:    cvt.s.d $f2, $f12
+; MIPS32-NEXT:    mfc1 $1, $f2
+; MIPS32-NEXT:    lui $2, 32768
+; MIPS32-NEXT:    xor $1, $1, $2
+; MIPS32-NEXT:    mov.d $f2, $f0
+; MIPS32-NEXT:    movz.d $f2, $f12, $1
+; MIPS32-NEXT:    mtc1 $zero, $f4
+; MIPS32-NEXT:    mtc1 $zero, $f5
+; MIPS32-NEXT:    c.eq.d $f0, $f4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    movt.d $f0, $f2, $fcc0
+; MIPS32R5-LABEL: minimumnum_double_nnan:
+; MIPS32R5:       # %bb.0:
+; MIPS32R5-NEXT:    mov.d $f0, $f14
+; MIPS32R5-NEXT:    c.olt.d $f12, $f14
+; MIPS32R5-NEXT:    movt.d $f0, $f12, $fcc0
+; MIPS32R5-NEXT:    cvt.s.d $f1, $f12
+; MIPS32R5-NEXT:    mfc1 $1, $f1
+; MIPS32R5-NEXT:    lui $2, 32768
+; MIPS32R5-NEXT:    xor $1, $1, $2
+; MIPS32R5-NEXT:    mov.d $f1, $f0
+; MIPS32R5-NEXT:    movz.d $f1, $f12, $1
+; MIPS32R5-NEXT:    mtc1 $zero, $f2
+; MIPS32R5-NEXT:    mthc1 $zero, $f2
+; MIPS32R5-NEXT:    c.eq.d $f0, $f2
+; MIPS32R5-NEXT:    jr $ra
+; MIPS32R5-NEXT:    movt.d $f0, $f1, $fcc0
   %z = call nnan double @llvm.minimumnum.f64(double %x, double %y)
   ret double %z
 }

>From 54f99adfabf4b51b7c17be5e3cfa24e4d884a792 Mon Sep 17 00:00:00 2001
From: YunQiang Su <wzssyqa at gmail.com>
Date: Wed, 3 Dec 2025 11:16:18 +0800
Subject: [PATCH 2/5] Update llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Co-authored-by: Nikita Popov <github at npopov.com>
---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 15a20b54b5e07..237076a695699 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8870,7 +8870,7 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
   EVT IntVT = VT.changeTypeToInteger();
   EVT FloatVT = VT.changeElementType(MVT::f32);
   SDValue LHSTrunc = LHS;
-  if (!isOperationLegal(ISD::BITCAST, IntVT) &&
+  if (!isTypeLegal(IntVT) &&
       !isOperationLegal(ISD::IS_FPCLASS, VT)) {
     LHSTrunc = DAG.getNode(ISD::FP_ROUND, DL, FloatVT, LHS,
                            DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));

>From 7049d73208f3658ccb5e4250d37b85a71285f6f3 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Wed, 3 Dec 2025 12:18:09 +0800
Subject: [PATCH 3/5] Add testcase for aarch64 and x86

---
 .../AArch64/fp-maximumnum-minimumnum.ll       | 124 ++++
 .../CodeGen/X86/fminimumnum-fmaximumnum.ll    | 660 ++++++++++++++----
 2 files changed, 656 insertions(+), 128 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
index 4906e2e15e51c..9bdbf82b3a781 100644
--- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
+++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
@@ -1076,6 +1076,68 @@ entry:
   ret <16 x half> %c
 }
 
+;;;;;;;;;;;;;;;;  max_f128
+define fp128 @max_fp128(fp128 %x, fp128 %y) {
+; AARCH64-LABEL: max_fp128:
+; AARCH64:       // %bb.0: // %start
+; AARCH64-NEXT:    sub sp, sp, #48
+; AARCH64-NEXT:    str x30, [sp, #32] // 8-byte Spill
+; AARCH64-NEXT:    .cfi_def_cfa_offset 48
+; AARCH64-NEXT:    .cfi_offset w30, -16
+; AARCH64-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:    bl __unordtf2
+; AARCH64-NEXT:    ldr q0, [sp, #16] // 16-byte Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.eq .LBB32_2
+; AARCH64-NEXT:  // %bb.1: // %start
+; AARCH64-NEXT:    str q0, [sp] // 16-byte Spill
+; AARCH64-NEXT:  .LBB32_2: // %start
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:    bl __unordtf2
+; AARCH64-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.eq .LBB32_4
+; AARCH64-NEXT:  // %bb.3: // %start
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:  .LBB32_4: // %start
+; AARCH64-NEXT:    ldr q0, [sp] // 16-byte Reload
+; AARCH64-NEXT:    str q1, [sp, #16] // 16-byte Spill
+; AARCH64-NEXT:    bl __gttf2
+; AARCH64-NEXT:    ldr q0, [sp] // 16-byte Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.le .LBB32_6
+; AARCH64-NEXT:  // %bb.5: // %start
+; AARCH64-NEXT:    str q0, [sp, #16] // 16-byte Spill
+; AARCH64-NEXT:  .LBB32_6: // %start
+; AARCH64-NEXT:    str q0, [sp] // 16-byte Spill
+; AARCH64-NEXT:    bl __trunctfsf2
+; AARCH64-NEXT:    fmov w8, s0
+; AARCH64-NEXT:    ldr q0, [sp, #16] // 16-byte Reload
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:    cmp w8, #0
+; AARCH64-NEXT:    b.ne .LBB32_8
+; AARCH64-NEXT:  // %bb.7: // %start
+; AARCH64-NEXT:    ldr q1, [sp] // 16-byte Reload
+; AARCH64-NEXT:  .LBB32_8: // %start
+; AARCH64-NEXT:    adrp x8, .LCPI32_0
+; AARCH64-NEXT:    str q1, [sp] // 16-byte Spill
+; AARCH64-NEXT:    ldr q1, [x8, :lo12:.LCPI32_0]
+; AARCH64-NEXT:    bl __eqtf2
+; AARCH64-NEXT:    ldr q0, [sp, #16] // 16-byte Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.ne .LBB32_10
+; AARCH64-NEXT:  // %bb.9: // %start
+; AARCH64-NEXT:    ldr q0, [sp] // 16-byte Reload
+; AARCH64-NEXT:  .LBB32_10: // %start
+; AARCH64-NEXT:    ldr x30, [sp, #32] // 8-byte Reload
+; AARCH64-NEXT:    add sp, sp, #48
+; AARCH64-NEXT:    ret
+start:
+  %0 = tail call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y)
+  ret fp128 %0
+}
+
 ;;;;;;;;;;;;;;;;  max_f64
 define double @max_f64(double %a, double %b) {
 ; AARCH64-LABEL: max_f64:
@@ -1658,6 +1720,68 @@ entry:
   ret <16 x half> %c
 }
 
+;;;;;;;;;;;;;;;;  min_f128
+define fp128 @min_fp128(fp128 %x, fp128 %y) {
+; AARCH64-LABEL: min_fp128:
+; AARCH64:       // %bb.0: // %start
+; AARCH64-NEXT:    sub sp, sp, #48
+; AARCH64-NEXT:    str x30, [sp, #32] // 8-byte Spill
+; AARCH64-NEXT:    .cfi_def_cfa_offset 48
+; AARCH64-NEXT:    .cfi_offset w30, -16
+; AARCH64-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:    bl __unordtf2
+; AARCH64-NEXT:    ldr q0, [sp, #16] // 16-byte Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.eq .LBB49_2
+; AARCH64-NEXT:  // %bb.1: // %start
+; AARCH64-NEXT:    str q0, [sp] // 16-byte Spill
+; AARCH64-NEXT:  .LBB49_2: // %start
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:    bl __unordtf2
+; AARCH64-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.eq .LBB49_4
+; AARCH64-NEXT:  // %bb.3: // %start
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:  .LBB49_4: // %start
+; AARCH64-NEXT:    ldr q0, [sp] // 16-byte Reload
+; AARCH64-NEXT:    str q1, [sp, #16] // 16-byte Spill
+; AARCH64-NEXT:    bl __gttf2
+; AARCH64-NEXT:    ldr q0, [sp] // 16-byte Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.le .LBB49_6
+; AARCH64-NEXT:  // %bb.5: // %start
+; AARCH64-NEXT:    str q0, [sp, #16] // 16-byte Spill
+; AARCH64-NEXT:  .LBB49_6: // %start
+; AARCH64-NEXT:    str q0, [sp] // 16-byte Spill
+; AARCH64-NEXT:    bl __trunctfsf2
+; AARCH64-NEXT:    fmov w8, s0
+; AARCH64-NEXT:    ldr q0, [sp, #16] // 16-byte Reload
+; AARCH64-NEXT:    mov v1.16b, v0.16b
+; AARCH64-NEXT:    cmp w8, #0
+; AARCH64-NEXT:    b.ne .LBB49_8
+; AARCH64-NEXT:  // %bb.7: // %start
+; AARCH64-NEXT:    ldr q1, [sp] // 16-byte Reload
+; AARCH64-NEXT:  .LBB49_8: // %start
+; AARCH64-NEXT:    adrp x8, .LCPI49_0
+; AARCH64-NEXT:    str q1, [sp] // 16-byte Spill
+; AARCH64-NEXT:    ldr q1, [x8, :lo12:.LCPI49_0]
+; AARCH64-NEXT:    bl __eqtf2
+; AARCH64-NEXT:    ldr q0, [sp, #16] // 16-byte Reload
+; AARCH64-NEXT:    cmp w0, #0
+; AARCH64-NEXT:    b.ne .LBB49_10
+; AARCH64-NEXT:  // %bb.9: // %start
+; AARCH64-NEXT:    ldr q0, [sp] // 16-byte Reload
+; AARCH64-NEXT:  .LBB49_10: // %start
+; AARCH64-NEXT:    ldr x30, [sp, #32] // 8-byte Reload
+; AARCH64-NEXT:    add sp, sp, #48
+; AARCH64-NEXT:    ret
+start:
+  %0 = tail call fp128 @llvm.minimumnum.f128(fp128 %x, fp128 %y) ; TODO: was @llvm.maximumnum.f128 (copy-paste); re-run update_llc_test_checks.py to regenerate the AARCH64 CHECK lines above
+  ret fp128 %0
+}
+
 ;;;;;;;;;;;;;;;;  min_f64
 define double @min_f64(double %a, double %b) {
 ; AARCH64-LABEL: min_f64:
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index aae6cda4458d2..9a745b6ca62c3 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -8,8 +8,10 @@
 
 declare float @llvm.maximumnum.f32(float, float)
 declare double @llvm.maximumnum.f64(double, double)
+declare fp128 @llvm.maximumnum.f128(fp128, fp128)
 declare float @llvm.minimumnum.f32(float, float)
 declare double @llvm.minimumnum.f64(double, double)
+declare fp128 @llvm.minimumnum.f128(fp128, fp128)
 declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
 declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
 declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
@@ -515,6 +517,207 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
   ret float %2
 }
 
+define fp128 @test_fmaximumnum_fp128(fp128 %x, fp128 %y) {
+; SSE2-LABEL: test_fmaximumnum_fp128:
+; SSE2:       # %bb.0: # %start
+; SSE2-NEXT:    subq $40, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    callq __unordtf2 at PLT
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    jne .LBB10_2
+; SSE2-NEXT:  # %bb.1: # %start
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB10_2: # %start
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    callq __unordtf2 at PLT
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    jne .LBB10_4
+; SSE2-NEXT:  # %bb.3: # %start
+; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB10_4: # %start
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    callq __gttf2 at PLT
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    jg .LBB10_6
+; SSE2-NEXT:  # %bb.5: # %start
+; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB10_6: # %start
+; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    callq __trunctfsf2 at PLT
+; SSE2-NEXT:    movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    je .LBB10_8
+; SSE2-NEXT:  # %bb.7: # %start
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:  .LBB10_8: # %start
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    callq __eqtf2 at PLT
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    je .LBB10_10
+; SSE2-NEXT:  # %bb.9: # %start
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:  .LBB10_10: # %start
+; SSE2-NEXT:    addq $40, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_fp128:
+; AVX:       # %bb.0: # %start
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    callq __unordtf2 at PLT
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    jne .LBB10_2
+; AVX-NEXT:  # %bb.1: # %start
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB10_2: # %start
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    callq __unordtf2 at PLT
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    jne .LBB10_4
+; AVX-NEXT:  # %bb.3: # %start
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB10_4: # %start
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    callq __gttf2 at PLT
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX-NEXT:    jg .LBB10_6
+; AVX-NEXT:  # %bb.5: # %start
+; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB10_6: # %start
+; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    callq __trunctfsf2 at PLT
+; AVX-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    je .LBB10_8
+; AVX-NEXT:  # %bb.7: # %start
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:  .LBB10_8: # %start
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    callq __eqtf2 at PLT
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    je .LBB10_10
+; AVX-NEXT:  # %bb.9: # %start
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:  .LBB10_10: # %start
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_fp128:
+; AVX10_2:       # %bb.0: # %start
+; AVX10_2-NEXT:    subq $40, %rsp
+; AVX10_2-NEXT:    .cfi_def_cfa_offset 48
+; AVX10_2-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    callq __unordtf2 at PLT
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    jne .LBB10_2
+; AVX10_2-NEXT:  # %bb.1: # %start
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB10_2: # %start
+; AVX10_2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    callq __unordtf2 at PLT
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:    jne .LBB10_4
+; AVX10_2-NEXT:  # %bb.3: # %start
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB10_4: # %start
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    callq __gttf2 at PLT
+; AVX10_2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX10_2-NEXT:    jg .LBB10_6
+; AVX10_2-NEXT:  # %bb.5: # %start
+; AVX10_2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB10_6: # %start
+; AVX10_2-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    callq __trunctfsf2 at PLT
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX10_2-NEXT:    vmovd %xmm0, %eax
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    je .LBB10_8
+; AVX10_2-NEXT:  # %bb.7: # %start
+; AVX10_2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:  .LBB10_8: # %start
+; AVX10_2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX10_2-NEXT:    callq __eqtf2 at PLT
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    je .LBB10_10
+; AVX10_2-NEXT:  # %bb.9: # %start
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB10_10: # %start
+; AVX10_2-NEXT:    addq $40, %rsp
+; AVX10_2-NEXT:    .cfi_def_cfa_offset 8
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_fp128:
+; X86:       # %bb.0: # %start
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    vmovups 24(%ebp), %ymm0
+; X86-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    calll fmaximum_numl
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    retl $4
+start:
+  %0 = tail call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y)
+  ret fp128 %0
+}
+
 ;
 ; fminimumnum
 ;
@@ -524,14 +727,14 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    js .LBB10_1
+; SSE2-NEXT:    js .LBB11_1
 ; SSE2-NEXT:  # %bb.2:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    jmp .LBB10_3
-; SSE2-NEXT:  .LBB10_1:
+; SSE2-NEXT:    jmp .LBB11_3
+; SSE2-NEXT:  .LBB11_1:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:  .LBB10_3:
+; SSE2-NEXT:  .LBB11_3:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    minss %xmm2, %xmm3
 ; SSE2-NEXT:    movaps %xmm3, %xmm1
@@ -547,14 +750,14 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    js .LBB10_1
+; AVX1-NEXT:    js .LBB11_1
 ; AVX1-NEXT:  # %bb.2:
 ; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
-; AVX1-NEXT:    jmp .LBB10_3
-; AVX1-NEXT:  .LBB10_1:
+; AVX1-NEXT:    jmp .LBB11_3
+; AVX1-NEXT:  .LBB11_1:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
 ; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX1-NEXT:  .LBB10_3:
+; AVX1-NEXT:  .LBB11_3:
 ; AVX1-NEXT:    vminss %xmm2, %xmm0, %xmm1
 ; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -587,14 +790,14 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    js .LBB10_1
+; X86-NEXT:    js .LBB11_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    jmp .LBB10_3
-; X86-NEXT:  .LBB10_1:
+; X86-NEXT:    jmp .LBB11_3
+; X86-NEXT:  .LBB11_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
-; X86-NEXT:  .LBB10_3:
+; X86-NEXT:  .LBB11_3:
 ; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -680,11 +883,11 @@ define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="tr
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %xmm0, %rax
 ; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB14_1
+; SSE2-NEXT:    js .LBB15_1
 ; SSE2-NEXT:  # %bb.2:
 ; SSE2-NEXT:    minsd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
-; SSE2-NEXT:  .LBB14_1:
+; SSE2-NEXT:  .LBB15_1:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    minsd %xmm2, %xmm0
@@ -694,11 +897,11 @@ define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="tr
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB14_1
+; AVX1-NEXT:    js .LBB15_1
 ; AVX1-NEXT:  # %bb.2:
 ; AVX1-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
-; AVX1-NEXT:  .LBB14_1:
+; AVX1-NEXT:  .LBB15_1:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
 ; AVX1-NEXT:    vminsd %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
@@ -740,14 +943,14 @@ define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="tr
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vextractps $1, %xmm0, %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    js .LBB14_1
+; X86-NEXT:    js .LBB15_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovapd %xmm1, %xmm2
-; X86-NEXT:    jmp .LBB14_3
-; X86-NEXT:  .LBB14_1:
+; X86-NEXT:    jmp .LBB15_3
+; X86-NEXT:  .LBB15_1:
 ; X86-NEXT:    vmovapd %xmm0, %xmm2
 ; X86-NEXT:    vmovapd %xmm1, %xmm0
-; X86-NEXT:  .LBB14_3:
+; X86-NEXT:  .LBB15_3:
 ; X86-NEXT:    vminsd %xmm2, %xmm0, %xmm0
 ; X86-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
@@ -903,14 +1106,14 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
 ; SSE2-NEXT:    divss %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    js .LBB19_1
+; SSE2-NEXT:    js .LBB20_1
 ; SSE2-NEXT:  # %bb.2:
 ; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    jmp .LBB19_3
-; SSE2-NEXT:  .LBB19_1:
+; SSE2-NEXT:    jmp .LBB20_3
+; SSE2-NEXT:  .LBB20_1:
 ; SSE2-NEXT:    movaps %xmm0, %xmm2
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:  .LBB19_3:
+; SSE2-NEXT:  .LBB20_3:
 ; SSE2-NEXT:    movaps %xmm0, %xmm3
 ; SSE2-NEXT:    minss %xmm2, %xmm3
 ; SSE2-NEXT:    movaps %xmm3, %xmm1
@@ -927,14 +1130,14 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
 ; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm2
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    js .LBB19_1
+; AVX1-NEXT:    js .LBB20_1
 ; AVX1-NEXT:  # %bb.2:
 ; AVX1-NEXT:    vmovaps %xmm2, %xmm1
-; AVX1-NEXT:    jmp .LBB19_3
-; AVX1-NEXT:  .LBB19_1:
+; AVX1-NEXT:    jmp .LBB20_3
+; AVX1-NEXT:  .LBB20_1:
 ; AVX1-NEXT:    vmovaps %xmm0, %xmm1
 ; AVX1-NEXT:    vmovaps %xmm2, %xmm0
-; AVX1-NEXT:  .LBB19_3:
+; AVX1-NEXT:  .LBB20_3:
 ; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -981,14 +1184,14 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
 ; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm2
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    js .LBB19_1
+; X86-NEXT:    js .LBB20_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovaps %xmm2, %xmm1
-; X86-NEXT:    jmp .LBB19_3
-; X86-NEXT:  .LBB19_1:
+; X86-NEXT:    jmp .LBB20_3
+; X86-NEXT:  .LBB20_1:
 ; X86-NEXT:    vmovaps %xmm0, %xmm1
 ; X86-NEXT:    vmovaps %xmm2, %xmm0
-; X86-NEXT:  .LBB19_3:
+; X86-NEXT:  .LBB20_3:
 ; X86-NEXT:    vminss %xmm1, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1526,15 +1729,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; SSE2-NEXT:    callq __extendhfsf2 at PLT
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    js .LBB33_1
+; SSE2-NEXT:    js .LBB34_1
 ; SSE2-NEXT:  # %bb.2:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    jmp .LBB33_3
-; SSE2-NEXT:  .LBB33_1:
+; SSE2-NEXT:    jmp .LBB34_3
+; SSE2-NEXT:  .LBB34_1:
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:  .LBB33_3:
+; SSE2-NEXT:  .LBB34_3:
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    psrlq $48, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1560,15 +1763,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; SSE2-NEXT:    callq __extendhfsf2 at PLT
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    js .LBB33_4
+; SSE2-NEXT:    js .LBB34_4
 ; SSE2-NEXT:  # %bb.5:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    jmp .LBB33_6
-; SSE2-NEXT:  .LBB33_4:
+; SSE2-NEXT:    jmp .LBB34_6
+; SSE2-NEXT:  .LBB34_4:
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:  .LBB33_6:
+; SSE2-NEXT:  .LBB34_6:
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    maxss %xmm1, %xmm2
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -1586,15 +1789,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; SSE2-NEXT:    callq __extendhfsf2 at PLT
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    js .LBB33_7
+; SSE2-NEXT:    js .LBB34_7
 ; SSE2-NEXT:  # %bb.8:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    jmp .LBB33_9
-; SSE2-NEXT:  .LBB33_7:
+; SSE2-NEXT:    jmp .LBB34_9
+; SSE2-NEXT:  .LBB34_7:
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:  .LBB33_9:
+; SSE2-NEXT:  .LBB34_9:
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    maxss %xmm1, %xmm2
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -1612,15 +1815,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; SSE2-NEXT:    callq __extendhfsf2 at PLT
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    js .LBB33_10
+; SSE2-NEXT:    js .LBB34_10
 ; SSE2-NEXT:  # %bb.11:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    jmp .LBB33_12
-; SSE2-NEXT:  .LBB33_10:
+; SSE2-NEXT:    jmp .LBB34_12
+; SSE2-NEXT:  .LBB34_10:
 ; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:  .LBB33_12:
+; SSE2-NEXT:  .LBB34_12:
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    maxss %xmm1, %xmm2
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -1658,15 +1861,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    callq __extendhfsf2 at PLT
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    js .LBB33_1
+; AVX1-NEXT:    js .LBB34_1
 ; AVX1-NEXT:  # %bb.2:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    jmp .LBB33_3
-; AVX1-NEXT:  .LBB33_1:
+; AVX1-NEXT:    jmp .LBB34_3
+; AVX1-NEXT:  .LBB34_1:
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX1-NEXT:  .LBB33_3:
+; AVX1-NEXT:  .LBB34_3:
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1685,15 +1888,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    callq __extendhfsf2 at PLT
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    js .LBB33_4
+; AVX1-NEXT:    js .LBB34_4
 ; AVX1-NEXT:  # %bb.5:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    jmp .LBB33_6
-; AVX1-NEXT:  .LBB33_4:
+; AVX1-NEXT:    jmp .LBB34_6
+; AVX1-NEXT:  .LBB34_4:
 ; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX1-NEXT:  .LBB33_6:
+; AVX1-NEXT:  .LBB34_6:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
@@ -1706,15 +1909,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    callq __extendhfsf2 at PLT
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    js .LBB33_7
+; AVX1-NEXT:    js .LBB34_7
 ; AVX1-NEXT:  # %bb.8:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    jmp .LBB33_9
-; AVX1-NEXT:  .LBB33_7:
+; AVX1-NEXT:    jmp .LBB34_9
+; AVX1-NEXT:  .LBB34_7:
 ; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX1-NEXT:  .LBB33_9:
+; AVX1-NEXT:  .LBB34_9:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
@@ -1727,15 +1930,15 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    callq __extendhfsf2 at PLT
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    js .LBB33_10
+; AVX1-NEXT:    js .LBB34_10
 ; AVX1-NEXT:  # %bb.11:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    jmp .LBB33_12
-; AVX1-NEXT:  .LBB33_10:
+; AVX1-NEXT:    jmp .LBB34_12
+; AVX1-NEXT:  .LBB34_10:
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX1-NEXT:  .LBB33_12:
+; AVX1-NEXT:  .LBB34_12:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
@@ -1919,14 +2122,14 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovd %xmm1, %eax
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    js .LBB33_1
+; X86-NEXT:    js .LBB34_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    jmp .LBB33_3
-; X86-NEXT:  .LBB33_1:
+; X86-NEXT:    jmp .LBB34_3
+; X86-NEXT:  .LBB34_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
-; X86-NEXT:  .LBB33_3:
+; X86-NEXT:  .LBB34_3:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1941,14 +2144,14 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovd %xmm1, %eax
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    js .LBB33_4
+; X86-NEXT:    js .LBB34_4
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    jmp .LBB33_6
-; X86-NEXT:  .LBB33_4:
+; X86-NEXT:    jmp .LBB34_6
+; X86-NEXT:  .LBB34_4:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
-; X86-NEXT:  .LBB33_6:
+; X86-NEXT:  .LBB34_6:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1979,14 +2182,14 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovd %xmm1, %eax
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    js .LBB33_7
+; X86-NEXT:    js .LBB34_7
 ; X86-NEXT:  # %bb.8:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    jmp .LBB33_9
-; X86-NEXT:  .LBB33_7:
+; X86-NEXT:    jmp .LBB34_9
+; X86-NEXT:  .LBB34_7:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
-; X86-NEXT:  .LBB33_9:
+; X86-NEXT:  .LBB34_9:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2001,14 +2204,14 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovd %xmm1, %eax
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    js .LBB33_10
+; X86-NEXT:    js .LBB34_10
 ; X86-NEXT:  # %bb.11:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    jmp .LBB33_12
-; X86-NEXT:  .LBB33_10:
+; X86-NEXT:    jmp .LBB34_12
+; X86-NEXT:  .LBB34_10:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
-; X86-NEXT:  .LBB33_12:
+; X86-NEXT:  .LBB34_12:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2060,14 +2263,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; SSE2-NEXT:    movd %ecx, %xmm1
 ; SSE2-NEXT:    shll $16, %eax
 ; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    js .LBB34_1
+; SSE2-NEXT:    js .LBB35_1
 ; SSE2-NEXT:  # %bb.2:
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    jmp .LBB34_3
-; SSE2-NEXT:  .LBB34_1:
+; SSE2-NEXT:    jmp .LBB35_3
+; SSE2-NEXT:  .LBB35_1:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:  .LBB34_3:
+; SSE2-NEXT:  .LBB35_3:
 ; SSE2-NEXT:    pextrw $0, %xmm2, %ebx
 ; SSE2-NEXT:    pextrw $0, %xmm3, %r14d
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -2084,14 +2287,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; SSE2-NEXT:    movd %r13d, %xmm1
 ; SSE2-NEXT:    shll $16, %r12d
 ; SSE2-NEXT:    movd %r12d, %xmm2
-; SSE2-NEXT:    js .LBB34_4
+; SSE2-NEXT:    js .LBB35_4
 ; SSE2-NEXT:  # %bb.5:
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    jmp .LBB34_6
-; SSE2-NEXT:  .LBB34_4:
+; SSE2-NEXT:    jmp .LBB35_6
+; SSE2-NEXT:  .LBB35_4:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:  .LBB34_6:
+; SSE2-NEXT:  .LBB35_6:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    maxss %xmm0, %xmm2
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -2106,14 +2309,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; SSE2-NEXT:    movd %r15d, %xmm1
 ; SSE2-NEXT:    shll $16, %ebp
 ; SSE2-NEXT:    movd %ebp, %xmm2
-; SSE2-NEXT:    js .LBB34_7
+; SSE2-NEXT:    js .LBB35_7
 ; SSE2-NEXT:  # %bb.8:
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    jmp .LBB34_9
-; SSE2-NEXT:  .LBB34_7:
+; SSE2-NEXT:    jmp .LBB35_9
+; SSE2-NEXT:  .LBB35_7:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:  .LBB34_9:
+; SSE2-NEXT:  .LBB35_9:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    maxss %xmm0, %xmm2
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -2128,14 +2331,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; SSE2-NEXT:    movd %r14d, %xmm1
 ; SSE2-NEXT:    shll $16, %ebx
 ; SSE2-NEXT:    movd %ebx, %xmm2
-; SSE2-NEXT:    js .LBB34_10
+; SSE2-NEXT:    js .LBB35_10
 ; SSE2-NEXT:  # %bb.11:
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    jmp .LBB34_12
-; SSE2-NEXT:  .LBB34_10:
+; SSE2-NEXT:    jmp .LBB35_12
+; SSE2-NEXT:  .LBB35_10:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:  .LBB34_12:
+; SSE2-NEXT:  .LBB35_12:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    maxss %xmm0, %xmm2
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -2185,14 +2388,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vmovd %ecx, %xmm0
 ; AVX1-NEXT:    shll $16, %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm4
-; AVX1-NEXT:    js .LBB34_1
+; AVX1-NEXT:    js .LBB35_1
 ; AVX1-NEXT:  # %bb.2:
 ; AVX1-NEXT:    vmovdqa %xmm4, %xmm1
-; AVX1-NEXT:    jmp .LBB34_3
-; AVX1-NEXT:  .LBB34_1:
+; AVX1-NEXT:    jmp .LBB35_3
+; AVX1-NEXT:  .LBB35_1:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm4, %xmm0
-; AVX1-NEXT:  .LBB34_3:
+; AVX1-NEXT:  .LBB35_3:
 ; AVX1-NEXT:    vpextrw $0, %xmm2, %ebp
 ; AVX1-NEXT:    vpextrw $0, %xmm3, %r15d
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
@@ -2204,14 +2407,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vmovd %r13d, %xmm0
 ; AVX1-NEXT:    shll $16, %r12d
 ; AVX1-NEXT:    vmovd %r12d, %xmm2
-; AVX1-NEXT:    js .LBB34_4
+; AVX1-NEXT:    js .LBB35_4
 ; AVX1-NEXT:  # %bb.5:
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
-; AVX1-NEXT:    jmp .LBB34_6
-; AVX1-NEXT:  .LBB34_4:
+; AVX1-NEXT:    jmp .LBB35_6
+; AVX1-NEXT:  .LBB35_4:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX1-NEXT:  .LBB34_6:
+; AVX1-NEXT:  .LBB35_6:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2221,14 +2424,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vmovd %r14d, %xmm0
 ; AVX1-NEXT:    shll $16, %ebx
 ; AVX1-NEXT:    vmovd %ebx, %xmm2
-; AVX1-NEXT:    js .LBB34_7
+; AVX1-NEXT:    js .LBB35_7
 ; AVX1-NEXT:  # %bb.8:
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
-; AVX1-NEXT:    jmp .LBB34_9
-; AVX1-NEXT:  .LBB34_7:
+; AVX1-NEXT:    jmp .LBB35_9
+; AVX1-NEXT:  .LBB35_7:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX1-NEXT:  .LBB34_9:
+; AVX1-NEXT:  .LBB35_9:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2238,14 +2441,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vmovd %r15d, %xmm0
 ; AVX1-NEXT:    shll $16, %ebp
 ; AVX1-NEXT:    vmovd %ebp, %xmm2
-; AVX1-NEXT:    js .LBB34_10
+; AVX1-NEXT:    js .LBB35_10
 ; AVX1-NEXT:  # %bb.11:
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
-; AVX1-NEXT:    jmp .LBB34_12
-; AVX1-NEXT:  .LBB34_10:
+; AVX1-NEXT:    jmp .LBB35_12
+; AVX1-NEXT:  .LBB35_10:
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX1-NEXT:  .LBB34_12:
+; AVX1-NEXT:  .LBB35_12:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2380,14 +2583,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovd %edi, %xmm0
 ; X86-NEXT:    shll $16, %edx
 ; X86-NEXT:    vmovd %edx, %xmm4
-; X86-NEXT:    js .LBB34_1
+; X86-NEXT:    js .LBB35_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm4, %xmm1
-; X86-NEXT:    jmp .LBB34_3
-; X86-NEXT:  .LBB34_1:
+; X86-NEXT:    jmp .LBB35_3
+; X86-NEXT:  .LBB35_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm1
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0
-; X86-NEXT:  .LBB34_3:
+; X86-NEXT:  .LBB35_3:
 ; X86-NEXT:    vpextrw $0, %xmm2, %edi
 ; X86-NEXT:    vpextrw $0, %xmm3, %ebp
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
@@ -2398,14 +2601,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovd %ecx, %xmm0
 ; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    vmovd %eax, %xmm2
-; X86-NEXT:    js .LBB34_4
+; X86-NEXT:    js .LBB35_4
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    vmovdqa %xmm2, %xmm1
-; X86-NEXT:    jmp .LBB34_6
-; X86-NEXT:  .LBB34_4:
+; X86-NEXT:    jmp .LBB35_6
+; X86-NEXT:  .LBB35_4:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm1
 ; X86-NEXT:    vmovdqa %xmm2, %xmm0
-; X86-NEXT:  .LBB34_6:
+; X86-NEXT:  .LBB35_6:
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2418,14 +2621,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovd %ebx, %xmm0
 ; X86-NEXT:    shll $16, %esi
 ; X86-NEXT:    vmovd %esi, %xmm2
-; X86-NEXT:    js .LBB34_7
+; X86-NEXT:    js .LBB35_7
 ; X86-NEXT:  # %bb.8:
 ; X86-NEXT:    vmovdqa %xmm2, %xmm1
-; X86-NEXT:    jmp .LBB34_9
-; X86-NEXT:  .LBB34_7:
+; X86-NEXT:    jmp .LBB35_9
+; X86-NEXT:  .LBB35_7:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm1
 ; X86-NEXT:    vmovdqa %xmm2, %xmm0
-; X86-NEXT:  .LBB34_9:
+; X86-NEXT:  .LBB35_9:
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2438,14 +2641,14 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovd %ebp, %xmm0
 ; X86-NEXT:    shll $16, %edi
 ; X86-NEXT:    vmovd %edi, %xmm2
-; X86-NEXT:    js .LBB34_10
+; X86-NEXT:    js .LBB35_10
 ; X86-NEXT:  # %bb.11:
 ; X86-NEXT:    vmovdqa %xmm2, %xmm1
-; X86-NEXT:    jmp .LBB34_12
-; X86-NEXT:  .LBB34_10:
+; X86-NEXT:    jmp .LBB35_12
+; X86-NEXT:  .LBB35_10:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm1
 ; X86-NEXT:    vmovdqa %xmm2, %xmm0
-; X86-NEXT:  .LBB34_12:
+; X86-NEXT:  .LBB35_12:
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
 ; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -2569,3 +2772,204 @@ define float @test_fminimumnum_snan(float %x, float %y) {
   %1 = tail call float @llvm.minimumnum.f32(float 0x7ff4000000000000, float %y)
   ret float %1
 }
+
+define fp128 @test_fminimumnum_fp128(fp128 %x, fp128 %y) {
+; SSE2-LABEL: test_fminimumnum_fp128:
+; SSE2:       # %bb.0: # %start
+; SSE2-NEXT:    subq $40, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    callq __unordtf2 at PLT
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    jne .LBB40_2
+; SSE2-NEXT:  # %bb.1: # %start
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB40_2: # %start
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    callq __unordtf2 at PLT
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    jne .LBB40_4
+; SSE2-NEXT:  # %bb.3: # %start
+; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB40_4: # %start
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    callq __lttf2 at PLT
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    js .LBB40_6
+; SSE2-NEXT:  # %bb.5: # %start
+; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB40_6: # %start
+; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    callq __trunctfsf2 at PLT
+; SSE2-NEXT:    movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    jo .LBB40_8
+; SSE2-NEXT:  # %bb.7: # %start
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:  .LBB40_8: # %start
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    callq __eqtf2 at PLT
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    je .LBB40_10
+; SSE2-NEXT:  # %bb.9: # %start
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:  .LBB40_10: # %start
+; SSE2-NEXT:    addq $40, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_fp128:
+; AVX:       # %bb.0: # %start
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    callq __unordtf2 at PLT
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    jne .LBB40_2
+; AVX-NEXT:  # %bb.1: # %start
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB40_2: # %start
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    callq __unordtf2 at PLT
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    jne .LBB40_4
+; AVX-NEXT:  # %bb.3: # %start
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB40_4: # %start
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    callq __lttf2 at PLT
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX-NEXT:    js .LBB40_6
+; AVX-NEXT:  # %bb.5: # %start
+; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB40_6: # %start
+; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    callq __trunctfsf2 at PLT
+; AVX-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    jo .LBB40_8
+; AVX-NEXT:  # %bb.7: # %start
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:  .LBB40_8: # %start
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    callq __eqtf2 at PLT
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    je .LBB40_10
+; AVX-NEXT:  # %bb.9: # %start
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:  .LBB40_10: # %start
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_fp128:
+; AVX10_2:       # %bb.0: # %start
+; AVX10_2-NEXT:    subq $40, %rsp
+; AVX10_2-NEXT:    .cfi_def_cfa_offset 48
+; AVX10_2-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    callq __unordtf2 at PLT
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    jne .LBB40_2
+; AVX10_2-NEXT:  # %bb.1: # %start
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB40_2: # %start
+; AVX10_2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    callq __unordtf2 at PLT
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:    jne .LBB40_4
+; AVX10_2-NEXT:  # %bb.3: # %start
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB40_4: # %start
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    callq __lttf2 at PLT
+; AVX10_2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX10_2-NEXT:    js .LBB40_6
+; AVX10_2-NEXT:  # %bb.5: # %start
+; AVX10_2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB40_6: # %start
+; AVX10_2-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    callq __trunctfsf2 at PLT
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX10_2-NEXT:    vmovd %xmm0, %eax
+; AVX10_2-NEXT:    negl %eax
+; AVX10_2-NEXT:    jo .LBB40_8
+; AVX10_2-NEXT:  # %bb.7: # %start
+; AVX10_2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:  .LBB40_8: # %start
+; AVX10_2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX10_2-NEXT:    callq __eqtf2 at PLT
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    je .LBB40_10
+; AVX10_2-NEXT:  # %bb.9: # %start
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB40_10: # %start
+; AVX10_2-NEXT:    addq $40, %rsp
+; AVX10_2-NEXT:    .cfi_def_cfa_offset 8
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_fp128:
+; X86:       # %bb.0: # %start
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    vmovups 24(%ebp), %ymm0
+; X86-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    calll fminimum_numl
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    retl $4
+start:
+  %0 = tail call fp128 @llvm.minimumnum.f128(fp128 %x, fp128 %y)
+  ret fp128 %0
+}

>From fc89739ff605657280ed28aa18b9309c0253c22f Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Wed, 3 Dec 2025 12:22:58 +0800
Subject: [PATCH 4/5] Fix code style

---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 237076a695699..e2bd0bfba4e3f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8870,8 +8870,7 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
   EVT IntVT = VT.changeTypeToInteger();
   EVT FloatVT = VT.changeElementType(MVT::f32);
   SDValue LHSTrunc = LHS;
-  if (!isTypeLegal(IntVT) &&
-      !isOperationLegal(ISD::IS_FPCLASS, VT)) {
+  if (!isTypeLegal(IntVT) && !isOperationLegal(ISD::IS_FPCLASS, VT)) {
     LHSTrunc = DAG.getNode(ISD::FP_ROUND, DL, FloatVT, LHS,
                            DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
   }

>From d06323b1cbbbfbff25c07b06f83c894135bf8740 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Wed, 3 Dec 2025 12:39:56 +0800
Subject: [PATCH 5/5] Update testcase

---
 llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll |   567 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll  | 13846 ++++++++---------
 llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll  | 13132 +++++++---------
 3 files changed, 12497 insertions(+), 15048 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
index ca660a9eec137..6b99b06e155fb 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
@@ -1859,87 +1859,76 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v3.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
 ; GFX11-SDAG-FAKE16:       ; %bb.0:
 ; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -1959,49 +1948,37 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v1.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v3.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -2012,58 +1989,51 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call bfloat @llvm.minimumnum.bf16(bfloat %a, bfloat %b)
   %min3 = call bfloat @llvm.minimumnum.bf16(bfloat %tmp0, bfloat %c)
@@ -2412,87 +2382,68 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v4.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v4, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v4.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v3.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v4.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v1.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v5.h, s2
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.h, s2
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v4, v4
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v2.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v2.h, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v5.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v2.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v1, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v2.l, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v5.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v5.h, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v1.h, s2
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v2.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0:
@@ -2581,102 +2532,81 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v4.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v4, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v4.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v4.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v5.h, s2
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.h, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v4, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v5.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v2.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v2.h, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v1, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.h, v0.l, v2.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v1
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.h
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v5.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v5.h, s1
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v2.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.h, s0
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v1.h, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v2.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v3.h, s0
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0:
@@ -2692,102 +2622,87 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v5, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
index f944686a96cd7..ccb807695e270 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -119,17 +119,13 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_bf16:
@@ -179,18 +175,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_bf16:
@@ -212,21 +202,17 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -312,13 +298,10 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_bf16_nnan:
@@ -353,16 +336,11 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_bf16_nnan:
@@ -374,21 +352,17 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -572,45 +546,37 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v3, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16:
@@ -665,55 +631,43 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v3, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16:
@@ -731,50 +685,44 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
   ret <2 x bfloat> %result
@@ -905,61 +853,51 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v3.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v4, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v6 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -973,35 +911,27 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v3.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v4, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_nnan:
@@ -1011,40 +941,32 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v6 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1299,66 +1221,53 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16:
@@ -1428,79 +1337,61 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16:
@@ -1513,75 +1404,66 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -1761,84 +1643,70 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v10, v9 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v9 :: v_dual_lshlrev_b32 v7, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16_nnan:
@@ -1850,49 +1718,36 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1903,57 +1758,43 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v10, v9 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v9 :: v_dual_lshlrev_b32 v7, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -2287,82 +2128,69 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v7.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v5.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16:
@@ -2456,97 +2284,80 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v8.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16:
@@ -2559,100 +2370,91 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
@@ -2887,108 +2689,94 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.h, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v5.h, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v6, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v1.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v9, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v11
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3001,63 +2789,47 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.h, v1.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v5.h, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v6, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v1.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
@@ -3067,73 +2839,61 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v9, v8
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v8, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
@@ -3620,129 +3380,110 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v2.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v5.h, v8.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, s0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v13, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v4.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v10.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v9.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v11.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v8.l, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v5.l, v12.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v12.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v12.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v5, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v12, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v4.l, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.h, v11.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v1, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.h, v1.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v9.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v5, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.h, v10.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v9.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v1.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.h, v4.h, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.l, v0.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.h, v0.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v4.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v0.h, v3.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v8.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v0.h, v9.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.h, v8.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s4
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_maximumnum_v6bf16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v8.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.l, v10.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v10.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v8.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v10.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v4.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v3.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v7
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v6bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
@@ -3863,141 +3604,118 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v9.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v13, v13
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v2.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v5.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v6
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v4.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v10.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v9.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v11.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.h, v4.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v8.l, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v8.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v4.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v5.l, v12.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v12.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v12.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v5, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v12, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v4.l, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.h, v11.h, s7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v1, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.h, v1.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v9.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v5, v5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.h, v10.h, s6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v9.h, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v1.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v9
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.l, v0.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.h, s3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v0, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.h, v0.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v8.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v0.h, v9.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.h, v8.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.h
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v0.h, v3.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.l, v10.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v10.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v8.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v10.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v4.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v3.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v7
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v6bf16:
@@ -4011,145 +3729,132 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v15, v14 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v14
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v14, v10 :: v_dual_lshlrev_b32 v15, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v14
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v12, v10, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v9, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v13, v12
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v15, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v10, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4788,311 +4493,279 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v11.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v17, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v17, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v6.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v13.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v7.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v2.h, v6.h, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v14, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v5.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v12.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v4.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v6.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v7.l, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v12.h, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v7, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v6.l, v2.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v16, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v2, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.h, v2.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v12.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v7, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v13.h, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.h, v12.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v5.l, v1.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v1.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v1, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v12.h, v1.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v12.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v7, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v12.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v12.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.l, v0.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v1.h, v5.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v0, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v5.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v0.h, v4.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.h, v0.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v11.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.h, v11.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v14.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v15.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v12.h, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.h, v14.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v15.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v11.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v10.l, v0.h, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v3.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v12.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.l, v13.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v13.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v12, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v11.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v13.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v6.l, v2.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v11.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v11.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v11.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v4.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v11.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v8
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v8bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v12 :: v_dual_and_b32 v11, 0xffff0000, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v10 :: v_dual_lshlrev_b32 v12, 16, v9
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v12, 0xffff0000, v5
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v18, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v9 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v17, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v11, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v13
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s3, s4
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v8bf16:
@@ -5104,186 +4777,153 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v11.l
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v11.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v17, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v17, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v7.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v2.h, v6.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v6.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v6.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.h
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v13.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v1.h, v5.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v14, v14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v5.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v4.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v5.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v0.h, v4.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v7.l, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v12.h, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v7, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v6.l, v2.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v2.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v16, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v2, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.h, v2.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v12.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v7, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v13.h, s8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.h, v12.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v5.l, v1.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v1.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v1, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v12.h, v1.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v12.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v7, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v12.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v12.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.l, v0.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v0, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v12.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.h, v0.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v11.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.h, v11.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v14.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v15.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v12.h, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.h, v14.h, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v15.h, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v1.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v11.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v11.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v10.l, v0.h, s3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v3.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v12.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.l, v13.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v13.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v12, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v11.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v13.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v6.l, v2.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v11.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v11.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v9
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v11.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v4.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v10
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v11.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v8
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v8bf16:
@@ -5293,199 +4933,178 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v12 :: v_dual_and_b32 v11, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v10 :: v_dual_lshlrev_b32 v12, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v12, 0xffff0000, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v13
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v18, v19
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v9 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v17, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v5
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v11, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v13
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s3, s4
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s1
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y)
   ret <8 x bfloat> %result
@@ -6454,920 +6073,839 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v19
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v21, v22
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v20, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v22, v21, s5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v24, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v21, v20, s4
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v24, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v23, v22, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v17, v20, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v26, v26
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v21, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v17, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, v25, v24, s5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v24, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v18, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v19, v25, s5
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v27, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v29, v28, s5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v28, v27, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v27
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v27
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v20, v23
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v19, v27, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v26, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v21, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v26
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v29, v28, s7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v28, v26, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v24, v25, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v0
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v21, v24
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v27, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v22, v26, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v26, v23, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v29, v28, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v28, v22, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v28, v28
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v26
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v25, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v24, v22, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v28, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v26, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s5
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v27, v25
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v29, v29
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v27, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v25, v25
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v5
+; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v26, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v5, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v25, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX10-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v11
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v25, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v11
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s7
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v27, v26
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v15, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s6
+; GFX10-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX10-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v15, v15
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s6
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v14, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s6
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v24, v15
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v26, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v13
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v14
+; GFX10-NEXT:    v_perm_b32 v3, v23, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v11
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s5
+; GFX10-NEXT:    s_and_b32 s5, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s5
+; GFX10-NEXT:    s_and_b32 s5, s9, s10
+; GFX10-NEXT:    v_perm_b32 v1, v21, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s5
+; GFX10-NEXT:    v_perm_b32 v0, v22, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v20, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v16bf16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v16.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v16.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v17.l, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v17.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v6.h, v14.h, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v29, v29
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v14.h, v16.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v32, v32
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v16.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v6.h, v14.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v14.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v18, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v5.h, v13.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v18.h, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v18.l, v16.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v13.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v19, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v18
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v4.h, v12.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v12.h, v23.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v5.h, v13.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.h, v13.h, v16.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v16.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v16, v25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v25.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v32, v32
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v25.h, v16.h, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.l, v16.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v20.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v32, v32
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v4.h, v12.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.h, v12.h, v16.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v16, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v26.h, v16.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.l, v16.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v21.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v3.h, v11.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.h, v11.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v16, v27
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v27.h, v16.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.l, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v22.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v11.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v2.h, v10.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v2.h, v10.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v10.h, v16.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v16, v28
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v28.h, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.l, v16.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v23.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v10.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v1.h, v9.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v1.h, v9.h, s5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v9.h, v16.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v16, v29
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.l, v29.h, v16.h, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.l, v16.h, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v24.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v30, v30
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v0.h, v8.h, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v8.h, v16.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v16, v30
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v30.h, v16.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.l, v15.h, v16.h, s7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v15.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v15.l, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v7.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v7.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s7, v7, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v16.h, v7.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v6.h, v16.h, s7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v14.l, v6.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v14.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s8, v6, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v6.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v5.h, v16.h, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v13.l, v5.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v5.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v13.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s9, v5, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v16.h, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v12.l, v4.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v12.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s10, v4, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.h, v4.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v16.h, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v11.l, v3.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v11.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s11, v3, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.h, v3.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v3.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.h, v16.h, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v10.l, v2.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v10.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s12, v2, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v16.h, v2.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v16.h, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, s13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v9.l, v1.h, s14
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v9.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s13, v1, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v0.l, v8.l, s14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v1.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v17.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v1.h, s15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.l, v17.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v18.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v20.h, v25.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v32.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.h, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.h, v18.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v20.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v8.l, v32.h, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v18.l, v0.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v28.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v32, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v0.l, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v21.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v29.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v23.h, v28.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v9.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v0.h, v8.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v16.l, v15.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v24.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v15.l, v16.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v29.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v30.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.l, v32.h, s15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v23.l, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v24.l, v8.h, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v8.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v25, v25
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v16.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v23.h, v16.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v14.l, v6.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v16, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v23.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v13.l, v5.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v23
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v23.h, v5.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v12.l, v4.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v17
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v23.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v11.l, v3.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v18
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v23.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v10.l, v2.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v23.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v9.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v20
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v21
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v23.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v15.h, v0.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v8.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v23.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v15
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v16bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v13
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v16 :: v_dual_and_b32 v18, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v21, v22
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v22, v21, s1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v21, v20, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v23, v22, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v18
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v24, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v21, s1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v25, v24, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v22, v21 :: v_dual_lshlrev_b32 v25, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v19, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v24, v23, s1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v29, v28, s1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v20, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v24, v25, s3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v27, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v29, v28, s3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v22, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v25 :: v_dual_lshlrev_b32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v24, v26, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v29, v28, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v28, v23 :: v_dual_lshlrev_b32 v29, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v29
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v28, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v25, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v23, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s1
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v27, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v26, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, v13, v5, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v25, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s3
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v17, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v18, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v21, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v14, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v26, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v19, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s1
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v20, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v16bf16:
@@ -7377,355 +6915,299 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v15
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v16, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v16
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v23.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v15
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v16.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v6.h, v14.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v14.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v18, v18
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v17
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v5.h, v13.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v16.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v17
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v16.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v16.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v17.l, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v17.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v16.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v6.h, v14.h, s0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v29, v29
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v14.h, v16.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v32, v32
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v18.h, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v18.l, v16.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v18.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v13.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v19, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v18
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v18
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v4.h, v12.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v12.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v5.h, v13.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.h, v13.h, v16.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v16, v25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v25.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v32, v32
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v25.h, v16.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.l, v16.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v20.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v32, v32
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v4.h, v12.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.h, v12.h, v16.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v16, v26
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v26.h, v16.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.l, v16.h, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v21.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.h, v11.h, s3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v16, v27
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v27.h, v16.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.l, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v22.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v2.h, v10.h, s4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v10.h, v16.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v16, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v28.h, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.l, v16.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v23.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v23.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v3.h, v11.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v11.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v20
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v2.h, v10.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v10.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v1.h, v9.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v9.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v0.h, v8.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v16
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v1.h, v9.h, s5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v9.h, v16.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v16, v29
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.l, v29.h, v16.h, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.l, v16.h, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v24.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v30, v30
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v0.h, v8.h, s6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v8.h, v16.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v16, v30
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v30.h, v16.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.l, v15.h, v16.h, s7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v15.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v15.l, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v7.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v7.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s7, v7, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v16.h, v7.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v16.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v6.h, v16.h, s7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v14.l, v6.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v6.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v14.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s8, v6, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v6.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v5.h, v16.h, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v16
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v13.l, v5.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v13.h, s8
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s9, v5, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v16.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v16.h, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v12.l, v4.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v12.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s10, v4, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.h, v4.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v16.h, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v16
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v11.l, v3.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v3.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v11.h, s10
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s11, v3, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.h, v3.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v16.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v3.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.h, v16.h, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v10.l, v2.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v10.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s12, v2, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v16.h, v2.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v16.h, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, s13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v14
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v9.l, v1.h, s14
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v9.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s13, v1, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v0.l, v8.l, s14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v1.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v17.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v1.h, s15
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.l, v17.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v18.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v20.h, v25.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v32.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.h, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.h, v18.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v20.l, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v8.l, v32.h, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v18.l, v0.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v28.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v32, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v0.l, s13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v21.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v29.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v23.h, v28.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v16.l, v15.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v24.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v15.l, v16.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v25, v25
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v23
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v29.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v30.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.l, v32.h, s15
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v23.l, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v24.l, v8.h, s5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v8.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v15.h, v0.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v16.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v23.h, v16.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v14.l, v6.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v23
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v16, v16
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v23.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v13.l, v5.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v23
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v23.h, v5.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v12.l, v4.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v17
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v23.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v11.l, v3.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v18
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v23.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v10.l, v2.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v23.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v9.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v20
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v21
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v23.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v8.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v22
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v23.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v15
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v16bf16:
@@ -7735,403 +7217,361 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v13
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v16 :: v_dual_and_b32 v18, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v21, v22
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v22, v21, s1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v21, v20, s0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v23, v22, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v18
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v24, v25
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v21, s1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v25, v24, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v22, v21 :: v_dual_lshlrev_b32 v25, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v19, v25
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v24, v23, s1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v29, v28, s1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v25
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v20, v26
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v24, v25, s3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v27, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v29, v28, s3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v26, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v22, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v25 :: v_dual_lshlrev_b32 v25, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v24, v26, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v29, v28, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v28, v23 :: v_dual_lshlrev_b32 v29, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v28, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v26
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v25, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v23, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v26, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v27, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s3
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v26, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v13, v13, v5, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v25, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s3
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s2
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v17, v5, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v18, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v21, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v14, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v26, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v19, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s1
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v20, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y)
   ret <16 x bfloat> %result
@@ -9507,2592 +8947,2216 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v50, off, s32
 ; GFX950-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
-; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
+; GFX950-NEXT:    v_cmp_u_f32_e64 s[0:1], v48, v48
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v52, v51, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v31
+; GFX950-NEXT:    v_cndmask_b32_e64 v38, v38, v35, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX950-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX950-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v39
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX950-NEXT:    v_cmp_gt_f32_e64 s[2:3], v39, v48
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v31
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v35
+; GFX950-NEXT:    v_cndmask_b32_e64 v32, v32, v31, s[2:3]
+; GFX950-NEXT:    v_cmp_gt_f32_e64 s[2:3], v36, v49
+; GFX950-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX950-NEXT:    v_cndmask_b32_e64 v36, v38, v35, s[2:3]
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v32
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[2:3], 0, v38
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[4:5], 0, v39
+; GFX950-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX950-NEXT:    s_and_b64 vcc, s[4:5], s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v36, v35, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
+; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v22
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v31
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
 ; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v21
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
 ; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v20
 ; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v19
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
 ; GFX950-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
 ; GFX950-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
 ; GFX950-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
 ; GFX950-NEXT:    v_accvgpr_write_b32 a2, v42 ; Reload Reuse
 ; GFX950-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v50
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v50
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v33, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v32
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX950-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v50
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v37
-; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v33, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX950-NEXT:    v_cmp_gt_f32_e64 s[0:1], v34, v36
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v33
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX950-NEXT:    v_cndmask_b32_e64 v34, v35, v33, s[0:1]
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
+; GFX950-NEXT:    v_cmp_eq_f32_e64 s[0:1], 0, v35
+; GFX950-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v37
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v51, v37, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v34, v36
+; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
+; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v37, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v37, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v35
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
-; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v36
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v37
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
-; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v38
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v39
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v51
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v6
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v49, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v48
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v49, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v51
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
-; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v21
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v51, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v49
 ; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v49, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v51, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v49, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v20
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
+; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v51
 ; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
-; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v19
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
+; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v52
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v55
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
-; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v54, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v53
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v55, v40
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v54, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
-; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v17
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v55, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v54
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v54, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v40, v55, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
-; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v54, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v55, v54, vcc
+; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v40, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v55
 ; GFX950-NEXT:    v_cndmask_b32_e32 v40, v40, v55, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
 ; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v41, v40, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v40
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v40, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v40, v40, v55, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v40, v55, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v15
 ; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v15, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v40
 ; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v40, v50, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v40
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v14
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v50, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v14
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
-; GFX950-NEXT:    v_perm_b32 v15, v32, v15, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v14
 ; GFX950-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v50
 ; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v50, v30, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v30
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v50
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
-; GFX950-NEXT:    v_perm_b32 v14, v31, v14, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v13
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v13
 ; GFX950-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v30
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v29
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
-; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
-; GFX950-NEXT:    v_perm_b32 v13, v33, v13, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v12
 ; GFX950-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v30, v29
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v28
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX950-NEXT:    v_perm_b32 v12, v34, v12, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v11
 ; GFX950-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v29, v28
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v27
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX950-NEXT:    v_perm_b32 v11, v35, v11, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v10
 ; GFX950-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v28, v27
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v26
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
+; GFX950-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX950-NEXT:    v_perm_b32 v10, v36, v10, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v9
 ; GFX950-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v27, v26
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v25
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX950-NEXT:    v_perm_b32 v9, v37, v9, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v8
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v25
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX950-NEXT:    v_perm_b32 v8, v38, v8, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v7
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX950-NEXT:    v_perm_b32 v7, v39, v7, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v6
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v23
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX950-NEXT:    v_perm_b32 v6, v48, v6, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v5
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v22
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX950-NEXT:    v_perm_b32 v5, v49, v5, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v4
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v21
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX950-NEXT:    v_perm_b32 v4, v51, v4, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v3
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v20
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX950-NEXT:    v_perm_b32 v3, v52, v3, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v2
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v19
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX950-NEXT:    v_perm_b32 v2, v53, v2, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v18
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
-; GFX950-NEXT:    v_perm_b32 v1, v54, v1, s0
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v17
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v55, v0, s0
+; GFX950-NEXT:    v_perm_b32 v1, v54, v1, s0
+; GFX950-NEXT:    v_perm_b32 v2, v53, v2, s0
+; GFX950-NEXT:    v_perm_b32 v3, v52, v3, s0
+; GFX950-NEXT:    v_perm_b32 v4, v51, v4, s0
+; GFX950-NEXT:    v_perm_b32 v5, v49, v5, s0
+; GFX950-NEXT:    v_perm_b32 v6, v48, v6, s0
+; GFX950-NEXT:    v_perm_b32 v7, v39, v7, s0
+; GFX950-NEXT:    v_perm_b32 v8, v38, v8, s0
+; GFX950-NEXT:    v_perm_b32 v9, v37, v9, s0
+; GFX950-NEXT:    v_perm_b32 v10, v36, v10, s0
+; GFX950-NEXT:    v_perm_b32 v11, v35, v11, s0
+; GFX950-NEXT:    v_perm_b32 v12, v34, v12, s0
+; GFX950-NEXT:    v_perm_b32 v13, v32, v13, s0
+; GFX950-NEXT:    v_perm_b32 v14, v31, v14, s0
+; GFX950-NEXT:    v_perm_b32 v15, v33, v15, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v11
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v12
 ; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v27
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, v32, v33, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v28
 ; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; GFX10-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v48, v39, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v39, v49, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v23
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v35, v32, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v33
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v33, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v70, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v38, v34, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
-; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX10-NEXT:    v_cndmask_b32_e32 v35, v39, v33, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v31, v38
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v26
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v53, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v35
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v39, v48
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v9
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v48
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v37, v32, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s21, 0, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v87, 16, v27
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v35, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v49, v31, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v31
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, v53, v52, s6
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v36, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v38
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v9
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v35, v35
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v33
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v49, v50
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v52, v38, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v39, v39
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v38
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v50, v49, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v49, v39, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v52, v52
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v54, v53, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v52, v33, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v36, v36
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v25
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v38, v31, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, v49, v37, s6
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v8
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v36, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v37, v35, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v49, v49
+; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v54, v52, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v37, v37
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v51, v55
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v66, v52, v36, s7
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v23
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s9, v51, v54
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v65, v64, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
+; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v22
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v64, v37, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v64
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v67, v65, s7
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v49
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v49, s7
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v55, v55
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s8, v31, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v65, v64, s7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v64, v55, s7
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v65, v65
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v67, v66, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v52
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v64, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v65
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s9, v54, v67
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v66, v65, s7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v65, v49, s7
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v68, v69
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v4
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v54, v54
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v69, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v67, v66, s10
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v52, v54, s10
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s8, v55, v67
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v49
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v68, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v70, v69, s10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v67, v67
-; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v66, v54, s10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v70, v69, s10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v71, v71
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v71, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v69, v68, s10
-; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v3
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s11, v70, v81
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v67
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v69, v69
-; GFX10-NEXT:    v_and_b32_e32 v70, 0xffff0000, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v52
+; GFX10-NEXT:    v_cndmask_b32_e64 v70, v54, v51, s10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v80, v80
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v70
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v69, v52, s10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v54, v54
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s11, v68, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v69
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v80, v71, s10
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s10, v55, v67
+; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s12, v82, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v71, s10
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s10, v31, v51
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v19
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v67, v67
 ; GFX10-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v69
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v71, v69, s13
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v70, v70
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v71, v54, s13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v68, v68
 ; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v81, v80, s13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v68, v81, v80, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v82, v82
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v80, v70, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v80, v68, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v71, v71
 ; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v82, v82, v81, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v71, v71
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s13, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v70
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s13, v55, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v68
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v80
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s22, 0, v82
 ; GFX10-NEXT:    v_cndmask_b32_e64 v71, v81, v82, s14
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s14, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v82
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v67, v54, s13
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s14, v55, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v82
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v71
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s15, v31, v81
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s15, v55, v81
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v16
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v55, v55
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v83, v83, v81, s16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v81, v83, s16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v81
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s16, v31, v84
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v31, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v84, s17
-; GFX10-NEXT:    v_and_b32_e32 v85, 0xffff0000, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v85, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v84, v31, s17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v55, v55
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s23, 0, v83
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v81, v83, s16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v55
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s16, v81, v84
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v55, v83, s16
+; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v84, v14, v30, s17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v84
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s17, v85, v86
-; GFX10-NEXT:    v_lshrrev_b32_e32 v86, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v84, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v31, v84, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v85
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s17, 0, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v37, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v84, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v37
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v32, v37, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v37, v36, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v34
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v37, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v36
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v34, v36, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v35, v33, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v30, v84, s17
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s24, 0, v84
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v81
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s17, v86, v85
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v85, v85
+; GFX10-NEXT:    v_cndmask_b32_e64 v85, v13, v29, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v85
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v13, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v29, v85, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v29
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s18, v86, v13
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v86, 16, v30
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v13, v13
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v86, s19
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v86, v14, s19
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v13
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s19, v30, v86
+; GFX10-NEXT:    v_and_b32_e32 v86, 0xffff0000, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s19, 0, v30
+; GFX10-NEXT:    s_and_b32 s19, s19, s20
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0, v32
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v13, v14, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v48, v34, s5
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s19, 0, v34
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v53, v35, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v64, v37, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v65, v49, s10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v70, v51, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v70, v71, v82, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v71, v81, v84, s17
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v39, v32, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, v50, v33, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v66, v36, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v66, v69, v52, s12
+; GFX10-NEXT:    s_and_b32 s4, s5, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v68, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v34, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v29, v85, s18
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v80
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s20
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v39
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s17, 0, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0, v35
+; GFX10-NEXT:    s_and_b32 s4, s4, s21
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v48
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v29, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s21, 0, v68
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v12, v28, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v29, v29
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v38, v31, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v53
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v12, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v30, v32, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v33
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v38
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v35
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v33, v35, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v48, v38, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v35, v38, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v38, v48, s4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v35
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v50, v39, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v65
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v48, v39, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v39, v50, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v52, v49, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v50, v49, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v49, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v52, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v55, v53, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v64, v65, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v69
-; GFX10-NEXT:    v_cndmask_b32_e64 v36, v52, v53, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v55, v65, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s8
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v66, v54, s11
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v54
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v65, v54, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v54, v66, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v65
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s8, 0, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v67, v68, s12
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v66, v68, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0, v67
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v68, v67, s9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v66
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v51, v69, s13
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v68, v69, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v69, v51, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v70, s14
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v70
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v69, v70, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v70, v80, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v71, v82, s15
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v80, v82, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v71
-; GFX10-NEXT:    v_cndmask_b32_e64 v71, v82, v71, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v81, v83, s16
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v83, v82, v83, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v81
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v83, v81, s10
-; GFX10-NEXT:    buffer_load_dword v83, off, s[0:3], s32
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v85, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v85, v85
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v14, v30, s11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v87, v30, v85, s11
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v30, v35, v38, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v50, v49, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v65, v54, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v84, v32, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v37
-; GFX10-NEXT:    v_and_b32_e32 v84, 0xffff0000, v15
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v37, v34, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v48, v39, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v68
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v69
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v55, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v37
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s8, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v64
+; GFX10-NEXT:    v_cndmask_b32_e64 v96, v11, v27, s19
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v30, v39, v33, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v65
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v48, v35, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s10, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, v28, v81, s18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v15
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v51
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s11, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v50, v36, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s8, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s19, 0, v52
+; GFX10-NEXT:    v_cndmask_b32_e32 v28, v53, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v69
+; GFX10-NEXT:    s_and_b32 vcc_lo, s9, s5
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0, v54
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v64, v49, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s13, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v70
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v96
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s14, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v55
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s15, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v71
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s16, 0, v34
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32
+; GFX10-NEXT:    s_and_b32 s7, s16, s24
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v83
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v15, v83, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v66, v67, s9
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v86, v50, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v50, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v83, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v39, v68, v51, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e32 v48, v69, v70, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v53, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v66
-; GFX10-NEXT:    v_cndmask_b32_e32 v65, v55, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v50, v80, v71, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v65, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v49, v53, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v65
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e32 v52, v82, v81, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v65, v54, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v85
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v87, v85, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v15, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v34
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v34
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s10, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v65, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v34, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v35, v38, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s11, s19
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v66, v52, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s12, s20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v35
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, v67, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v36, v48
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v39, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v49, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v35, v38, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s13, s21
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v69, v68, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s14, s22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v70, v82, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s15, s23
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v55, v83, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v37
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v71, v84, s7
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v87, v87
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v51, v37, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_perm_b32 v14, v14, v36, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v49, v38, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v53, v85, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v97
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v27, v96, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v81
+; GFX10-NEXT:    s_and_b32 vcc_lo, s17, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v80, v85, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX10-NEXT:    v_perm_b32 v13, v13, v27, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v97, v81, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v50
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v51, v96, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v55, v87, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v53, v28, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v65
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v54, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX10-NEXT:    v_perm_b32 v13, v14, v13, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v11
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v9
-; GFX10-NEXT:    v_perm_b32 v14, v31, v28, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v12, v32, v12, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v29
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
-; GFX10-NEXT:    v_perm_b32 v11, v33, v11, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v51, v51
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v26
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s5
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v52
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v10
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v54, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v26, v10, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v55, v55
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v25, v9, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v96
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v49, v81, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_perm_b32 v12, v12, v25, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v50, v96, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v53, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v22
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v51, v9, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v54, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v23
 ; GFX10-NEXT:    v_perm_b32 v10, v30, v10, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_perm_b32 v9, v34, v9, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_perm_b32 v8, v35, v8, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_perm_b32 v7, v36, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v52, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v24, v8, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v51, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s5
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v52, v26
+; GFX10-NEXT:    v_perm_b32 v9, v31, v9, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v53, v53
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX10-NEXT:    v_perm_b32 v8, v11, v8, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v29, v50, 0x5040100
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v51, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v26, v26
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v5
+; GFX10-NEXT:    v_perm_b32 v7, v28, v7, 0x5040100
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v49, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v21, v5, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v23
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX10-NEXT:    v_perm_b32 v5, v38, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX10-NEXT:    v_perm_b32 v3, v39, v3, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v26, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v19
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s7
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v51, v49
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v23, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_perm_b32 v2, v48, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v15, v49, v51, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s6
+; GFX10-NEXT:    v_perm_b32 v5, v15, v5, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v6, v32, v6, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v15, v38, v37, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v21, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX10-NEXT:    v_perm_b32 v4, v33, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v23, v23
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v21, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s6
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v22, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s6
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v24, v23
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v49, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v21
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v22
+; GFX10-NEXT:    v_perm_b32 v3, v34, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v19
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s5
+; GFX10-NEXT:    s_and_b32 s5, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s5
+; GFX10-NEXT:    s_and_b32 s5, s9, s10
+; GFX10-NEXT:    v_perm_b32 v1, v39, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s5
+; GFX10-NEXT:    v_perm_b32 v0, v48, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v35, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v32bf16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v68, off, s32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v36.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v39, v39
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v20
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v19
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v83, v83
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v86, v86
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v96.h, v0.l, v16.l, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s43, 0, v96.h
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v55, off, s32
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v37, v12
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v50, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v39, v9 :: v_dual_and_b32 v8, 0xffff0000, v53
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v51
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v48
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v54.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v68
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v55
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v53.h, v55.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v55.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v51.h, v30.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v30.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v48.h, v29.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v29.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v37.h, v28.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v28.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v34.h, v27.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v39
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v31.h, v26.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v50
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v39.h, v25.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v50.h, v24.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v7
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v36, v35
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0, v35.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.h, v31.l, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v31.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v32, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v32, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v7.h, v23.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v14.h, v30.h, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v30.h, v36.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v36, v37
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s44, 0, v37.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v37.h, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.l, v36.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v32.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v32.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v6.h, v22.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v13.h, v29.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v29.h, v36.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v36, v38
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v38.h, v36.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.l, v36.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v33.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v32.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v5.h, v21.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v4.h, v20.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v12.h, v28.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.h, v28.h, v36.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v36, v39
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v39.h, v36.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v34.l, v36.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v34.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v48, v48
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v11.h, v27.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.h, v27.h, v36.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v36, v48
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v48.h, v36.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v35.l, v36.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v35.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v10.h, v26.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v26.h, v36.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v36, v49
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v49.h, v36.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v38.l, v36.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v38.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v9.h, v25.h, s5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.h, v25.h, v36.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v36, v50
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v50.h, v36.h, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v48.l, v36.h, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v48.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v8.h, v24.h, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.h, v24.h, v36.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v36, v51
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v51.h, v36.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v50.l, v36.h, s7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v50.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v7.h, v23.h, s7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v23.h, v36.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s7, v36, v52
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v52.h, v36.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.l, v52.l, v36.h, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v52.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v6.h, v22.h, s8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.h, v22.h, v36.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s8, v36, v53
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v53.h, v36.h, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.h, v53.l, v36.h, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v53.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v5.h, v21.h, s9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.h, v36.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s9, v36, v54
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v54.h, v36.h, s9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.l, v54.l, v36.h, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v54.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v4.h, v20.h, s10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.h, v20.h, v36.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s10, v36, v55
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.l, v55.h, v36.h, s10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.h, v55.l, v36.h, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v55.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v3.h, v19.h, s11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.h, v19.h, v36.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s11, v36, v64
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.l, v64.h, v36.h, s11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.l, v64.l, v36.h, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v64.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v2.h, v18.h, s12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.h, v18.h, v36.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s12, v36, v65
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.l, v65.h, v36.h, s12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.h, v65.l, v36.h, s13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v65.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v1.h, v17.h, s13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.h, v17.h, v36.h, s14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s13, v36, v66
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.l, v66.h, v36.h, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v80.l, v66.l, v36.h, s14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v66.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v0.h, v16.h, s14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.h, v16.h, v36.h, s15
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v68
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s14, v36, v67
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.h, v15.l, v68.l, s15
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.l, v67.h, v36.h, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v67.l, v36.h, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v67.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v82.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v68.l, v82.h, s15
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s15, v82, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v68.l, v36.h, v82.h, s15
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v68.l, v82.h, s16
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v14.h, v36.h, s15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v68.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v14.l, v30.l, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v28
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v30.l, v14.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v14.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v27
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s16, v14, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v20.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v38, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v35
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v3.h, v19.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v82, v82
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v36.h, v14.h, s16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v14.l, v14.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v13.h, v36.h, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v14.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.l, v29.l, s17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v29.l, v13.h, s18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0, v13.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v23
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s17, v13, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v29.h, s16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v82, v82
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v36.h, v13.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v22
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v13.l, v13.h, s18
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v12.h, v36.h, s17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.l, v28.l, s18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v28.l, v12.h, s19
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0, v12.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v19
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s18, v12, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v28.h, s17
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v82, v82
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v36.h, v12.h, s18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v12.l, v12.h, s19
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v36.h, s18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v12.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.l, v27.l, s19
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v27.l, v11.h, s20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0, v11.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v27.h, s18
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s19, v11, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v36.h, v11.h, s19
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v11.l, v11.h, s20
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.h, v10.h, v36.h, s19
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v11.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v26.l, s20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v26.l, v10.h, s21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0, v10.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v26.h, s19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s20, v10, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v36.h, v10.h, s20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v10.l, v10.h, s21
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.h, v9.h, v36.h, s20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v10.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v25.l, s21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v25.l, v9.h, s22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0, v9.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v25.h, s20
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s21, v9, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v36.h, v9.h, s21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v9.h, s22
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v36.h, s21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v9.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v24.l, s22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v24.l, v8.h, s23
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0, v8.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v24.h, s21
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s22, v8, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v36.h, v8.h, s22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v8.l, v8.h, s23
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v7.h, v36.h, s22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v8.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, s23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v23.l, v7.h, s24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0, v7.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v23.h, s22
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s23, v7, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v36.h, v7.h, s23
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s24
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v6.h, v36.h, s23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v7.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, s24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v22.l, v6.h, s25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v22.h, s23
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s24, v6, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v36.h, v6.h, s24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s25
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v5.h, v36.h, s24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, s25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v21.l, v5.h, s26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0, v5.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.h, s24
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s25, v5, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v36.h, v5.h, s25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s26
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v4.h, v36.h, s25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v5.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, s26
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v19.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v38
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v38
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v2.h, v18.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v18.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v49
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v1.h, v17.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v17.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v64, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v52
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v0.h, v16.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v53
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.h, v16.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.h, v64.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v20.l, v4.h, s27
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0, v4.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v20.h, s25
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s26, v4, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v36.h, v4.h, s26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s27
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v3.h, v36.h, s26
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v4.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.h, v53.l, v55.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.h, v64.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v55.l, v66.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v51
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v30
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v66.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v64, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v66.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.h, v51.l, v30.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v54.h, v66.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v30.l, v51.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v48
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v51.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v55, v55
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v51.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v48.l, v29.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v28
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v54.h, v51.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v29.l, v30.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v37
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v30.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v48, v48
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v30, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v30.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v37.l, v28.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v54.h, v30.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v28.l, v29.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v34
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v27
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v29.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v30, v30
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v29.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v34.l, v27.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v54.h, v29.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v27.l, v28.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v28.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v29, v29
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v28.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.h, v31.l, v26.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v54.h, v28.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v26.l, v27.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v27.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v28, v28
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v27.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v54.h, v27.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v25.l, v26.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v50
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v26.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v27, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v26.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v54.h, v26.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v24.l, v25.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v25.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v25.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v54.h, v25.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v23.l, v7.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v24, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v54.h, v7.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v22.l, v6.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v54.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.l, v5.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v33
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v54.h, v5.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v20.l, v4.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v36
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v54.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v19.l, v3.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v32
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v54.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v18.l, v2.h, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v81, v81
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v19.l, v83.h, s28
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v3, v3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0, v83.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s27, v83, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.h, v2.l, v18.l, s28
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v1.l, v17.l, s40
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v32.h, v37.h, s44
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v36.h, v83.h, s27
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s40, 0, v82.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v85, v85
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v18.h, s26
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v83.h, s41
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v87, v87
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v36.h, s28
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v31.h, v35.h, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0, v39.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0, v19.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s45, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v18.l, v82.h, s29
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0, v38.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v34.h, v39.h, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v48.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s44, v82, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v33.h, v38.h, s29
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v1.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v36.h, v82.h, s44
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v49.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v50.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v1.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v82.h, s40
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v51.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v54.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v49.l, v50.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s45
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v52.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v51.l, v51.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v53.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v17.l, v19.h, s27
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v38.l, v0.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v69.l, v52.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v55.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v48.l, v1.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v19, v36
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v69.h, v53.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v70.h, v55.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v66.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v36.h, v19.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v65.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v50.l, v2.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v70.l, v54.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v52.l, v0.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v16.h, v19.h, s28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v53.l, v1.h, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v71.h, v65.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v54.l, v2.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v67.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v64.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v65.l, v1.h, s12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v67.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v55.l, v3.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v16.l, v96.h, s41
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v71.l, v64.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v80.l, v66.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v17.l, s4
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v96, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v64.l, v0.h, s11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v66.l, v16.l, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v67.l, v15.l, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v68.l, v30.h, s15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v36.h, v96.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v16.l, v96.h, s43
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v16.l, v17.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v35
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v17.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v54.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v17.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v38
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v49
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v54.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v16.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v52
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v54
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v54.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v53
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v32bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v128, v119, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v132, v131, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v130, v144, v135, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v102, 16, v11
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v53, v53
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s11, v133, v133
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v49, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v51
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s17, v49, v133
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v65, v65
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v69, v69
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v81, v81
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s12, v145, v145
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v67, v68, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v64
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v71, v80, s4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v85, v85
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s10, v129, v129
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v135, v130, s12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v67
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v54, v98
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s16, v37, v132
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v80
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v83, v84, s5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v97, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v70
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s18, v53, v134
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v35, v35, v36, s15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v65, v135
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v49, v130
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v37, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v131
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v132
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v69, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v81, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v85, v135
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v97, v144
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v101, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v112, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v114, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v116, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v118, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v128, v49
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v84
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v87, v96, s6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v101, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v81
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v67, v68, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v69, v144
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v114, v114
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v96
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v87, v99, v100, s7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v113, v113
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s13, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v85
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v70, v80, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v71, v145
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v116, v116, v115, s9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v99, v103, v112, s8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v117, v117
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v147, v34, s13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v87
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v81, v84, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v83, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v102, v115, v116, s9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v116
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v85, v96, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v86, v147
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v113, v119, v118, s10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v117, v131, v128, s11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v87, v100, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v97, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v99, v112, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v101, v98
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s14, v66, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v129
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v102, v116, s15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v103, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v70
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v80
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v82
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v84
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v96
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v98
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v35
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v102
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v39
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v34
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v67
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v71
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v83
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v87
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v99
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v103
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v113
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v115
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v119
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v48
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v37, v113, v118, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v115, v132
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v64
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v117, v128, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v119, v49
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v68
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v49, v129, v130, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v131, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v80
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v38, v34, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v66, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v84
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v82, v134
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v51
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v66
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v96
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v53
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v82
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v54
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s16, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v112
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s17, s1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s18, s2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v55, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s19, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s20, s4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v116
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v80, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s21, s5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v69, v84, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s22, s6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v118
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v70, v96, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s23, s7
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v71, v100, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v83
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v38
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v128
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v130
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s27, 0, v113
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s28, 0, v115
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v34
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s29, 0, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v30
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s40, 0, v119
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s41, 0, v129
+; GFX11-FAKE16-NEXT:    s_and_b32 s3, s40, s14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v67
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v33
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v64, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v65, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s24, s8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v54, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v32
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v50, v65, v32 :: v_dual_lshlrev_b32 v65, 16, v15
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s25, s9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v81, v116, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s26, s10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v118, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v67, v68
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s27, s11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v83, v128, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s28, s12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v130, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s29, s13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v114, v114
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v50, v32, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v27, v27, v11, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s41, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v30, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v36, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v28
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v26, v10 :: v_dual_lshlrev_b32 v31, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v38, v32
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v31
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v26, v10, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v50, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v11
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v31, v29
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v51, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v28, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v8, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s1
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v28, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v52, v9, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v53, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v55, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v26, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v21, v5, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v25, v24
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v19
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s3
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v33, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v64, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v54, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v22, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v26, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v37, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v19
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v49, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v65, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v32bf16:
@@ -12102,697 +11166,579 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    scratch_load_b32 v68, off, s32
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v36.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v36.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v28
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v27
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v39, v39
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v22
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v21
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v20
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v19
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v18
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v17
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v15
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v30
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v83, v83
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v86, v86
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v36.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v96.h, v0.l, v16.l, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s43, 0, v96.h
+; GFX12-TRUE16-NEXT:    scratch_load_b32 v55, off, s32
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v50, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v31, v10 :: v_dual_and_b32 v8, 0xffff0000, v53
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v39, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v54.l, 0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v30
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v24
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v51
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v23
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v22
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v20
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v48
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v19
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v18
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v17
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v16
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v54.l
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v68
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v55
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v53.h, v55.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v54.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v36, v35
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0, v35.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v55.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v51.h, v30.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v29
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v37
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v30.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v48.h, v29.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v28
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v34
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v29.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v37.h, v28.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v27
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v31
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v28.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v34.h, v27.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v26
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v39
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v31.h, v26.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v25
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v50
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v39.h, v25.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.h, v31.l, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v31.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v32, v32
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v14.h, v30.h, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v30.h, v36.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v36, v37
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s44, 0, v37.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v37.h, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.l, v36.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v32.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v50.h, v24.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v32, v32
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v7.h, v23.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v32.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v6.h, v22.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v32.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v5.h, v21.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v54.l
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v4.h, v20.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v20.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v38, v38
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v35
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v35
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v3.h, v19.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v19.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v38
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v38
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v2.h, v18.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v18.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v49
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v1.h, v17.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v17.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v64, v64
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v52
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v0.h, v16.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v53
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v13.h, v29.h, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v29.h, v36.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v36, v38
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v38.h, v36.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.l, v36.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v33.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v12.h, v28.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.h, v28.h, v36.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v36, v39
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v39.h, v36.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v34.l, v36.h, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v34.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v48, v48
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v11.h, v27.h, s3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.h, v27.h, v36.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v36, v48
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v48.h, v36.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v35.l, v36.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v35.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v10.h, v26.h, s4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v26.h, v36.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v36, v49
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v49.h, v36.h, s4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v38.l, v36.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v38.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v9.h, v25.h, s5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.h, v25.h, v36.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v36, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v50.h, v36.h, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v48.l, v36.h, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v48.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v8.h, v24.h, s6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.h, v24.h, v36.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v36, v51
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v51.h, v36.h, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v50.l, v36.h, s7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v50.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v7.h, v23.h, s7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v23.h, v36.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s7, v36, v52
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v52.h, v36.h, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.l, v52.l, v36.h, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v52.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v6.h, v22.h, s8
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.h, v22.h, v36.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s8, v36, v53
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v53.h, v36.h, s8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.h, v53.l, v36.h, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v53.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v5.h, v21.h, s9
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.h, v36.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s9, v36, v54
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v54.h, v36.h, s9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.l, v54.l, v36.h, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v54.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v4.h, v20.h, s10
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.h, v20.h, v36.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s10, v36, v55
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.l, v55.h, v36.h, s10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.h, v55.l, v36.h, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v55.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v3.h, v19.h, s11
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.h, v19.h, v36.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s11, v36, v64
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.l, v64.h, v36.h, s11
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.l, v64.l, v36.h, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v64.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v2.h, v18.h, s12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.h, v18.h, v36.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s12, v36, v65
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.l, v65.h, v36.h, s12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.h, v65.l, v36.h, s13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v65.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v1.h, v17.h, s13
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.h, v17.h, v36.h, s14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s13, v36, v66
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.l, v66.h, v36.h, s13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v80.l, v66.l, v36.h, s14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v66.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v0.h, v16.h, s14
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.h, v16.h, v36.h, s15
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v68
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s14, v36, v67
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v82.h, v15.l, v68.l, s15
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.l, v67.h, v36.h, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v67.l, v36.h, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v67.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v82.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v68.l, v82.h, s15
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s15, v82, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v68.l, v36.h, v82.h, s15
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v68.l, v82.h, s16
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v14.h, v36.h, s15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v68.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v14.l, v30.l, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v28
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v30.l, v14.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v14.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v27
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s16, v14, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v82, v82
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v36.h, v14.h, s16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v26
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v14.l, v14.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v25
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v13.h, v36.h, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v14.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.l, v29.l, s17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v29.l, v13.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0, v13.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v23
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s17, v13, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v29.h, s16
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v82, v82
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v36.h, v13.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v22
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v13.l, v13.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v21
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v12.h, v36.h, s17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v13.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.l, v28.l, s18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v28.l, v12.h, s19
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0, v12.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v19
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s18, v12, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v28.h, s17
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v82, v82
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v36.h, v12.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v12.l, v12.h, s19
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v10
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v36.h, s18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v12.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.l, v27.l, s19
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v27.l, v11.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0, v11.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v27.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s19, v11, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v36.h, v11.h, s19
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v11.l, v11.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.h, v10.h, v36.h, s19
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v11.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v26.l, s20
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v26.l, v10.h, s21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0, v10.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v26.h, s19
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s20, v10, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v36.h, v10.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v10.l, v10.h, s21
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.h, v9.h, v36.h, s20
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v10.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v25.l, s21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v25.l, v9.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0, v9.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v25.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s21, v9, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v36.h, v9.h, s21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v9.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v36.h, s21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v9.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v24.l, s22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v24.l, v8.h, s23
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0, v8.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v24.h, s21
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s22, v8, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v36.h, v8.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v8.l, v8.h, s23
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v7.h, v36.h, s22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v8.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, s23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v23.l, v7.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0, v7.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v23.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s23, v7, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v36.h, v7.h, s23
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v6.h, v36.h, s23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, s24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v22.l, v6.h, s25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0, v6.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v22.h, s23
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s24, v6, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v36.h, v6.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s25
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v5.h, v36.h, s24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, s25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v21.l, v5.h, s26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s25, v5, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v36.h, v5.h, s25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s26
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v4.h, v36.h, s25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, s26
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v36
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.h, v16.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.h, v64.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v20.l, v4.h, s27
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v20.h, s25
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s26, v4, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v36.h, v4.h, s26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s27
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v3.h, v36.h, s26
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.h, v53.l, v55.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.h, v64.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v55.l, v66.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v51
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v30
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v66.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v54
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v64, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v66.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v29
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.h, v51.l, v30.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v54.h, v66.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v30.l, v51.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v48
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v51.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v55, v55
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v51.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v48.l, v29.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v28
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v54.h, v51.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v29.l, v30.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v37
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v30.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v48, v48
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v30, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v30.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v37.l, v28.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v54.h, v30.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v28.l, v29.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v34
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v27
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v29.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v30, v30
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v29.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v34.l, v27.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v54.h, v29.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v27.l, v28.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v31
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v26
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v28.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v29, v29
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v28.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.h, v31.l, v26.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v54.h, v28.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v26.l, v27.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v27.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v28, v28
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v27.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v54.h, v27.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v25.l, v26.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v50
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v26.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v27, v27
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v26.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v54.h, v26.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v24.l, v25.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v25.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v25.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v54.h, v25.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v23.l, v7.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v24, v24
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v54.h, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v22.l, v6.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v54
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v54.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.l, v5.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v7, v33
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v54.h, v5.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v20.l, v4.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v36
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v18
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v54.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v19.l, v3.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v32
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v17
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v54.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v18.l, v2.h, s2
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v81, v81
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v19.l, v83.h, s28
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v3, v3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0, v83.h
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s27, v83, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v82.h, v2.l, v18.l, s28
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v1.l, v17.l, s40
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v32.h, v37.h, s44
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v36.h, v83.h, s27
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s40, 0, v82.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v85, v85
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v18.h, s26
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v83.h, s41
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v87, v87
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v36.h, s28
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v31.h, v35.h, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0, v39.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0, v19.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s45, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v18.l, v82.h, s29
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0, v38.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v34.h, v39.h, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v48.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s44, v82, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v33.h, v38.h, s29
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v1.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v36.h, v82.h, s44
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v49.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v50.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v1.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v82.h, s40
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v51.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v54.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v49.l, v50.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s45
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v52.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v51.l, v51.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v53.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v17.l, v19.h, s27
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v38.l, v0.h, s4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v69.l, v52.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v55.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v48.l, v1.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v19, v36
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v69.h, v53.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v70.h, v55.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v66.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v36.h, v19.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v65.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v50.l, v2.h, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v70.l, v54.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v52.l, v0.h, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v16.h, v19.h, s28
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v53.l, v1.h, s8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v71.h, v65.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v54.l, v2.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v67.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v64.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v65.l, v1.h, s12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v67.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v55.l, v3.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v16.l, v96.h, s41
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v71.l, v64.h, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v80.l, v66.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v17.l, s4
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v96, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v64.l, v0.h, s11
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v66.l, v16.l, s13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v67.l, v15.l, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v68.l, v30.h, s15
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v36.h, v96.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v16.l, v96.h, s43
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v16.l, v17.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v35
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v17.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v54.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v17.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v38
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v49
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v54.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v16.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v52
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v54
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v54.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v53
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v32bf16:
@@ -12802,792 +11748,655 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v128, v119, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v132, v131, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v130, v144, v135, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v102, 16, v11
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v53, v53
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s11, v133, v133
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v49, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v52
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v51
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s17, v49, v133
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v65, v65
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v69, v69
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v81, v81
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s12, v145, v145
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v67, v68, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v64
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v71, v80, s4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v85, v85
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s10, v129, v129
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v135, v130, s12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v55
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v67
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v54, v98
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s16, v37, v132
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v80
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v83, v84, s5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v97, v97
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v70
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s18, v53, v134
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v35, v35, v36, s15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s16
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v65, v135
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v49, v130
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v37, v129
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v84
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v131
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v132
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v69, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v81, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v85, v135
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v97, v144
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v101, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v112, v146
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v114, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v116, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v118, v30
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v128, v49
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v48
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v68
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v70
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v82
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v86
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v96
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v98
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v35
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v102
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v39
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v34
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v51
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v67
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v71
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v83
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v99
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v113
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v119
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v67
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v33
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v64, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v66
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v87, v96, s6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v101, v101
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v81
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s18
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v67, v68, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v69, v144
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v114, v114
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v96
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v87, v99, v100, s7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v113, v113
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s13, v38, v38
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v85
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v70, v80, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v71, v145
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v116, v116, v115, s9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v99, v103, v112, s8
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v117, v117
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v147, v34, s13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v87
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v81, v84, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v83, v146
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v102, v115, v116, s9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v99
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v116
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v85, v96, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v86, v147
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v113, v119, v118, s10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v117, v131, v128, s11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v87, v100, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v97, v54
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v113
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v128
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v117
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v99, v112, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v101, v98
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s14, v66, v66
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v129
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v102, v116, s15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v103, v37
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v48
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v37, v113, v118, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v115, v132
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v52
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v64
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v117, v128, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v119, v49
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v68
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v49, v129, v130, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v131, v133
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v80
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v38, v34, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v66, v53
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v84
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s15, v82, v134
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v51
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v66
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v96
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v53
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v82
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v54
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s16, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v112
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v39, v48, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s17, s1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v52, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s18, s2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v55, v64, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s19, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s20, s4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v116
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v80, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s21, s5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v69, v84, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s22, s6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v118
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v70, v96, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s23, s7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v71, v100, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v83
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v38
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v128
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v130
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s27, 0, v113
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s28, 0, v115
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v34
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s29, 0, v117
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v27
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s40, 0, v119
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s41, 0, v129
+; GFX12-FAKE16-NEXT:    s_and_b32 s3, s40, s14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v65, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s24, s8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v54, v112, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v32
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v50, v65, v32 :: v_dual_lshlrev_b32 v65, 16, v15
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s25, s9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v81, v116, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s26, s10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v50
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v118, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v66
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v67, v68
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s27, s11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v83, v128, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s28, s12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v50
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v130, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s29, s13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v66
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v67
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v114, v114
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v28
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v50, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v27, v27, v11, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s41, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v30, v15, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v36, v13, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v28
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v32, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v26, v10 :: v_dual_lshlrev_b32 v31, 16, v27
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v38, v32
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v31
+; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v26, v10, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v26
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v31, v29
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v22
+; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v48, v11, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v51, v10, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v27, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v28, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v8, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v28, v26
+; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v52, v9, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v53, v8, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s3
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v55, v7, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v26, v24
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v21, v5, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v25, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v19
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s3
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s2
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v33, v5, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v64, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v54, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v22, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v26, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v37, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v19
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s5, s6
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v49, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s1
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v65, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <32 x bfloat> @llvm.maximumnum.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y)
   ret <32 x bfloat> %result
@@ -13703,17 +12512,13 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_bf16_no_ieee:
@@ -13763,18 +12568,12 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_bf16_no_ieee:
@@ -13796,21 +12595,17 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -13994,45 +12789,37 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v3, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee:
@@ -14087,55 +12874,43 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v3, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee:
@@ -14153,50 +12928,44 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
   ret <2 x bfloat> %result
@@ -14449,66 +13218,53 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -14578,79 +13334,61 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -14663,75 +13401,66 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -15065,82 +13794,69 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v7.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v5.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
@@ -15234,97 +13950,80 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v8.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v6
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
@@ -15337,100 +14036,91 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
index ecf06f3c2f379..246fa7d41e1ef 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -121,17 +121,13 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_bf16:
@@ -181,18 +177,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_bf16:
@@ -214,21 +204,17 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -317,13 +303,10 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_bf16_nnan:
@@ -358,16 +341,11 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_bf16_nnan:
@@ -379,21 +357,17 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -581,45 +555,37 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v3, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16:
@@ -674,55 +640,43 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v3, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16:
@@ -740,50 +694,44 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
   ret <2 x bfloat> %result
@@ -917,61 +865,51 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v3.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v4, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v6 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -985,35 +923,27 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v3.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v4, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_nnan:
@@ -1023,40 +953,32 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v6 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1314,66 +1236,53 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16:
@@ -1443,79 +1352,61 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16:
@@ -1528,75 +1419,66 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -1778,84 +1660,70 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v10, v9 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v9 :: v_dual_lshlrev_b32 v7, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16_nnan:
@@ -1867,49 +1735,36 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1920,57 +1775,43 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v10, v9 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v9 :: v_dual_lshlrev_b32 v7, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -2307,82 +2148,69 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v7.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v5.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16:
@@ -2476,97 +2304,80 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v8.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16:
@@ -2579,100 +2390,91 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
@@ -2910,108 +2712,94 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.h, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v5.h, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v6, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v1.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v9, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v11
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3024,63 +2812,47 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.h, v1.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v1.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v5.h, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.h, v0.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v6, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v1.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v0.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
@@ -3090,73 +2862,61 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v9, v8
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v14, v13, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v8, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
@@ -3646,129 +3406,110 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v2.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v5.h, v8.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, s0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v13, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v4.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v10.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v9.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v11.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v8.l, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v5.l, v12.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v12.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v5, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v12, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v4.l, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.h, v11.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v1, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.h, v1.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v9.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v5, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.h, v10.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v9.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v1.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.h, v4.h, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.l, v0.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.h, v0.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v4.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v0.h, v3.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v8.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v0.h, v9.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.h, v8.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s4
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_minimumnum_v6bf16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v8.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.l, v10.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v10.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v8.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v10.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v4.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v3.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v7
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v6bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
@@ -3889,141 +3630,118 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v9.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v13, v13
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v2.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v5.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v6
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v4.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v10.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v9.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v11.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.h, v4.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v8.l, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v8.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v4.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v5.l, v12.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v12.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v5, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v12, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v4.l, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.h, v11.h, s7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v1, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.h, v1.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v9.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v5, v5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.h, v10.h, s6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.h, v9.h, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v1.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v9
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.l, v0.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.h, s3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v0, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.h, v0.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v8.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v0.h, v9.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.h, v8.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.h
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v0.h, v3.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v3.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v8.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v8.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v5.l, v10.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v10.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v8.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v10.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v4.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v3.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v7
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v6bf16:
@@ -4037,145 +3755,132 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v15, v14 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v14
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v14, v10 :: v_dual_lshlrev_b32 v15, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v14
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v12, v10, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v7, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v9, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v13, v12
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v15, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v10, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4816,311 +4521,279 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v11.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v17, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v17, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v6.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v13.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v7.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v2.h, v6.h, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v14, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v5.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v12.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v4.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v6.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v7.l, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v12.h, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v7, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.h, v12.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v6.l, v2.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v16, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v2, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.h, v2.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v7, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v13.h, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.h, v12.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v5.l, v1.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v1.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v1, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v12.h, v1.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v7, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v12.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v12.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.l, v0.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v1.h, v5.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v0, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v5.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v0.h, v4.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.h, v0.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v11.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.h, v11.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v14.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v15.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v12.h, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.h, v14.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v15.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v11.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v11.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v10.l, v0.h, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v3.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v12.h, v11.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.l, v13.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v13.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v12, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v11.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v13.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v6.l, v2.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v11.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v11.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v11.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v4.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v11.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v8
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v8bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v12 :: v_dual_and_b32 v11, 0xffff0000, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v10 :: v_dual_lshlrev_b32 v12, 16, v9
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v12, 0xffff0000, v5
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v18, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v9 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v17, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v11, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v13
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s3, s4
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v8bf16:
@@ -5132,186 +4805,153 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v11.l
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v11.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v17, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v17, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v7.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v2.h, v6.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v6.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v6.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.h
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v13.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v1.h, v5.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v14, v14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v5.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v4.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v5.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v0.h, v4.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v7.l, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v12.h, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v7, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.h, v12.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v6.l, v2.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v16, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v2, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.h, v2.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v7, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.h, v13.h, s8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.h, v12.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v5.l, v1.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v1.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v1, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v12.h, v1.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v7, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v12.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v12.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.l, v0.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v0, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.h, v0.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v11.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.h, v11.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v14.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v15.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v12.h, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.h, v14.h, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v15.h, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v1.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v11.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v11.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v10.l, v0.h, s3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v3.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v12.h, v11.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v7.l, v13.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v13.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v12, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v11.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.h, v13.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v6.l, v2.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v11.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v11.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v9
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v11.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v4.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v10
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v11
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v11.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v8
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v8bf16:
@@ -5321,199 +4961,178 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v12 :: v_dual_and_b32 v11, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v10 :: v_dual_lshlrev_b32 v12, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v12, 0xffff0000, v5
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v14, v9, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v13
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v15, v13, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v14
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v9, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v18, v19
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v13, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v9 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v17, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v5
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v11, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v13
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s3, s4
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s1
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y)
   ret <8 x bfloat> %result
@@ -6484,920 +6103,839 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v19
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v21, v22
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v20, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v22, v21, s5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v24, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v21, v20, s4
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v24, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v23, v22, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v17, v20, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v26, v26
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v21, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v17, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, v25, v24, s5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v24, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v18, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v19, v25, s5
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v27, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v29, v28, s5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v28, v27, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v27
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v27
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v20, v23
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v19, v27, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v26, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v21, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v26
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v29, v28, s7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v28, v26, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v24, v25, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v0
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v21, v24
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v27, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v22, v26, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v26, v23, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v29, v28, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v28, v22, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v28, v28
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
-; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v26
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v25, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v24, v22, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v28, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v26, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s5
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v27, v25
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v29, v29
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v27, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v25, v25
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v5
+; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v26, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v5, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v25, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX10-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v11
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v25, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v11
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s7
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v27, v26
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v15, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s6
+; GFX10-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX10-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v15, v15
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s6
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v14, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s6
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v24, v15
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v26, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v13
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v14
+; GFX10-NEXT:    v_perm_b32 v3, v23, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v11
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s5
+; GFX10-NEXT:    s_and_b32 s5, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s5
+; GFX10-NEXT:    s_and_b32 s5, s9, s10
+; GFX10-NEXT:    v_perm_b32 v1, v21, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s5
+; GFX10-NEXT:    v_perm_b32 v0, v22, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v20, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v16bf16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v16.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v16.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v17.l, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v17.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v6.h, v14.h, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v29, v29
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v14.h, v16.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v32, v32
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v16.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v6.h, v14.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v14.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v18, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v5.h, v13.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v18.h, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v18.l, v16.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v13.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v19, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v18
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v4.h, v12.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v12.h, v23.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v5.h, v13.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.h, v13.h, v16.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v16, v25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v25.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v32, v32
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v25.h, v16.h, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.l, v16.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v20.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v32, v32
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v4.h, v12.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.h, v12.h, v16.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v16, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v26.h, v16.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.l, v16.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v21.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v3.h, v11.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.h, v11.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v16, v27
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v27.h, v16.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.l, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v22.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v11.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v2.h, v10.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v2.h, v10.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v10.h, v16.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v16, v28
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v28.h, v16.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.l, v16.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v23.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v10.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v1.h, v9.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v1.h, v9.h, s5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v9.h, v16.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v16, v29
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.l, v29.h, v16.h, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.l, v16.h, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v24.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v30, v30
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v0.h, v8.h, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v8.h, v16.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v16, v30
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v30.h, v16.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.l, v15.h, v16.h, s7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v15.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v15.l, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v7.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v7.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s7, v7, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v16.h, v7.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v6.h, v16.h, s7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v14.l, v6.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v14.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s8, v6, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v6.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v5.h, v16.h, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v13.l, v5.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v5.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v13.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s9, v5, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v16.h, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v12.l, v4.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v12.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s10, v4, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.h, v4.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v16.h, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v11.l, v3.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v11.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s11, v3, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.h, v3.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v3.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.h, v16.h, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v10.l, v2.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v10.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s12, v2, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v16.h, v2.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v31, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v16.h, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, s13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v9.l, v1.h, s14
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v9.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s13, v1, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v0.l, v8.l, s14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v1.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v17.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v1.h, s15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.l, v17.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v18.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v20.h, v25.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v32.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.h, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.h, v18.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v20.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v8.l, v32.h, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v18.l, v0.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v28.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v32, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v0.l, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v21.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v29.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v23.h, v28.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v9.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v0.h, v8.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v23.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.h, v23.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v16.l, v15.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v24.h, v23.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v15.l, v16.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v29.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.l, v32.h, s15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v23.l, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v24.l, v8.h, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v16.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v8.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v25, v25
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v16.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v23.h, v16.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v14.l, v6.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v16, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v23.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v13.l, v5.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v23
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v23.h, v5.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v12.l, v4.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v17
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v23.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v11.l, v3.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v18
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v23.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v10.l, v2.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v23.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v9.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v20
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v21
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v23.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v23.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v15.h, v0.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v8.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v23.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v15
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v16bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v13
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v16 :: v_dual_and_b32 v18, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v21, v22
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v22, v21, s1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v21, v20, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v23, v22, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v24, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v21, s1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v25, v24, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v22, v21 :: v_dual_lshlrev_b32 v25, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v19, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v24, v23, s1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v29, v28, s1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v20, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v24, v25, s3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v27, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v29, v28, s3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v22, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v25 :: v_dual_lshlrev_b32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v24, v26, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v29, v28, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v28, v23 :: v_dual_lshlrev_b32 v29, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v29
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v28, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v25, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v23, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s1
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v27, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v26, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, v13, v5, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v25, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s3
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v17, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v18, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v21, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v14, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v26, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v19, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s1
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v20, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v16bf16:
@@ -7407,355 +6945,299 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v15
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v16, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, 0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v16
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v23.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v15
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v16.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v6.h, v14.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v14.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v18, v18
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v17
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v5.h, v13.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v16.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v17
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v16.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v16.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v17.l, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v17.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v16.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v6.h, v14.h, s0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v29, v29
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v14.h, v16.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v32, v32
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v18.h, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v18.l, v16.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v18.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v13.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v19, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v18
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v18
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v18.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v4.h, v12.h, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v12.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v5.h, v13.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.h, v13.h, v16.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v32, v32
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v16, v25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v25.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v32, v32
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v25.h, v16.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.l, v16.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v20.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v32, v32
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v4.h, v12.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.h, v12.h, v16.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v16, v26
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v26.h, v16.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.l, v16.h, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v21.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v22, v22
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v3.h, v11.h, s3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v16, v27
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v27.h, v16.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.l, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v22.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v2.h, v10.h, s4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v10.h, v16.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v16, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v28.h, v16.h, s4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.l, v16.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v23.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v24, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v23.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v3.h, v11.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v11.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v20
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v20.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v2.h, v10.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v10.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v21.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v1.h, v9.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v9.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v22.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v0.h, v8.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v16
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v1.h, v9.h, s5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v9.h, v16.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v16, v29
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.l, v29.h, v16.h, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.l, v16.h, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v24.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v30, v30
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v0.h, v8.h, s6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v8.h, v16.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v16, v30
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v30.h, v16.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.l, v15.h, v16.h, s7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v15.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v15.l, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v7.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v7.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s7, v7, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v16.h, v7.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v6.h, v16.h, s7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v14.l, v6.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v14.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s8, v6, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v6.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v5.h, v16.h, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v16
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v13.l, v5.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v13.h, s8
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s9, v5, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v4.h, v16.h, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v12.l, v4.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v12.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s10, v4, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.h, v4.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v3.h, v16.h, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v16
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v11.l, v3.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v3.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v11.h, s10
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s11, v3, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.h, v3.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.l, v3.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v2.h, v16.h, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v10.l, v2.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v10.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s12, v2, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v16.h, v2.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, v2.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v31, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v1.h, v16.h, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, s13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v23.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v24.h, v23.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v14
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v9.l, v1.h, s14
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v9.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s13, v1, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v0.l, v8.l, s14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v31, v31
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v1.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v17.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v1.h, s15
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.l, v17.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v18.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v20.h, v25.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v32.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.h, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v19.h, v18.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v20.l, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v8.l, v32.h, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v18.l, v0.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v28.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v32, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v0.l, s13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v21.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v29.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v23.h, v28.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v16.l, v15.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v24.h, v23.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v15.l, v16.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v25, v25
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v23
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v29.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.l, v32.h, s15
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v23.l, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v24.l, v8.h, s5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v16.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v8.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v15.h, v0.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v16.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v23.h, v16.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v14.l, v6.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v23
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v16, v16
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v23.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v13.l, v5.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v23
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v23.h, v5.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v12.l, v4.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v17
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v23.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v11.l, v3.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v18
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v23.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v10.l, v2.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v23.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v9.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v20
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v21
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v23.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v23.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v8.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v22
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v23
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v23.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v23.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v15
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v16bf16:
@@ -7765,403 +7247,361 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v13
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v16 :: v_dual_and_b32 v18, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v20, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v21, v22
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v18, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v22, v21, s1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v21, v20, s0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v23, v22, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v18
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v24, v25
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v21, s1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v25, v24, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v22, v21 :: v_dual_lshlrev_b32 v25, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v19, v25
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v24, v23, s1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v29, v28, s1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v25
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v20, v26
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v24, v25, s3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v27, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v29, v28, s3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v28, v26, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v22, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v25 :: v_dual_lshlrev_b32 v25, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v24, v26, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v22
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v29, v28, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v28, v23 :: v_dual_lshlrev_b32 v29, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v28, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v26
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v25, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v23, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v26, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v27, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v15, v7, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v14, v6, s3
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v26, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v13, v13, v5, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v15
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v25, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v12, v12, v4, s3
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v11, v11, v3, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s2
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v17, v5, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v18, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v21, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v14, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v26, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v10, v10, v2, s3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v19, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v11
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s1
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v20, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y)
   ret <16 x bfloat> %result
@@ -10146,1832 +9586,1610 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_perm_b32 v10, v36, v10, s0
 ; GFX950-NEXT:    v_perm_b32 v11, v35, v11, s0
 ; GFX950-NEXT:    v_perm_b32 v12, v34, v12, s0
-; GFX950-NEXT:    v_perm_b32 v13, v33, v13, s0
+; GFX950-NEXT:    v_perm_b32 v13, v32, v13, s0
 ; GFX950-NEXT:    v_perm_b32 v14, v31, v14, s0
-; GFX950-NEXT:    v_perm_b32 v15, v32, v15, s0
+; GFX950-NEXT:    v_perm_b32 v15, v33, v15, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v11
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v12
 ; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v27
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, v32, v33, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v28
 ; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; GFX10-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v48, v39, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v39, v49, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v23
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v35, v32, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v33
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v33, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v70, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v38, v34, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
-; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX10-NEXT:    v_cndmask_b32_e32 v35, v39, v33, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v31, v38
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v26
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v53, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v35
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v39, v48
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v9
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v48
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v37, v32, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v87, 16, v27
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v35, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v49, v31, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v31
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, v53, v52, s6
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v36, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v38
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v9
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v35, v35
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v33
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v49, v50
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v52, v38, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v39, v39
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v38
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v50, v49, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v49, v39, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v52, v52
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v54, v53, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v52, v33, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v36, v36
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v25
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v38, v31, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, v49, v37, s6
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v8
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v36, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v37, v35, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v49, v49
+; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v54, v52, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v37, v37
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v51, v55
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v66, v52, v36, s7
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v23
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v6
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s9, v51, v54
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v65, v64, s7
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v49
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v49, s7
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v55, v55
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s8, v31, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v65, v64, s7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v64, v55, s7
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v65, v65
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v67, v66, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v52
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v64, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v65
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s9, v54, v67
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v66, v65, s7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v22
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v64, v37, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v64
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v67, v65, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v65, v49, s7
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v68, v69
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v4
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v54, v54
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v69, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v67, v66, s10
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v52, v54, s10
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s8, v55, v67
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v49
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v68, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v70, v69, s10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v67, v67
-; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v66, v54, s10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v70, v69, s10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v71, v71
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v71, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v69, v68, s10
-; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v3
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s11, v70, v81
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v67
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v69, v69
-; GFX10-NEXT:    v_and_b32_e32 v70, 0xffff0000, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v52
+; GFX10-NEXT:    v_cndmask_b32_e64 v70, v54, v51, s10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v80, v80
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v70
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v69, v52, s10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v54, v54
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s11, v68, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v69
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v80, v71, s10
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s10, v55, v67
+; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v2
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s12, v82, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v71, s10
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s10, v31, v51
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v19
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v67, v67
 ; GFX10-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v69
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v71, v69, s13
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v70, v70
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v71, v54, s13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v68, v68
 ; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v81, v80, s13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v68, v81, v80, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v82, v82
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v80, v70, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v80, v68, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v71, v71
 ; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v82, v82, v81, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v71, v71
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s13, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v70
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s13, v55, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v68
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v80
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s22, 0x8000, v82
 ; GFX10-NEXT:    v_cndmask_b32_e64 v71, v81, v82, s14
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s14, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v82
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v67, v54, s13
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s14, v55, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v82
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v71
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v0
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s15, v31, v81
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s15, v55, v81
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v16
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v55, v55
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v83, v83, v81, s16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v81, v83, s16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v81
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s16, v31, v84
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v31, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v84, s17
-; GFX10-NEXT:    v_and_b32_e32 v85, 0xffff0000, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v85, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v84, v31, s17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v55, v55
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s23, 0x8000, v83
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v81, v83, s16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v55
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s16, v81, v84
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v55, v83, s16
+; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v84, v14, v30, s17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v84
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s17, v85, v86
-; GFX10-NEXT:    v_lshrrev_b32_e32 v86, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v84, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v31, v84, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v85
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s17, 0, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v37, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v84, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v37
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v32, v37, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v37, v36, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v34
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v37, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v36
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v34, v36, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v35, v33, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v30, v84, s17
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s24, 0x8000, v84
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v81
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s17, v86, v85
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v85, v85
+; GFX10-NEXT:    v_cndmask_b32_e64 v85, v13, v29, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v85
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v13, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v29, v85, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v29
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s18, v86, v13
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v86, 16, v30
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v13, v13
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v86, s19
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v86, v14, s19
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v13
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s19, v30, v86
+; GFX10-NEXT:    v_and_b32_e32 v86, 0xffff0000, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s19, 0, v30
+; GFX10-NEXT:    s_and_b32 s19, s19, s20
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v32
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v13, v14, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v48, v34, s5
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v34
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v53, v35, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v64, v37, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v65, v49, s10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v70, v51, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v70, v71, v82, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v71, v81, v84, s17
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v39, v32, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, v50, v33, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v66, v36, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v66, v69, v52, s12
+; GFX10-NEXT:    s_and_b32 s4, s5, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v68, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v34, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v29, v85, s18
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v80
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s20
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v39
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s17, 0, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v35
+; GFX10-NEXT:    s_and_b32 s4, s4, s21
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v48
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v29, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v68
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v12, v28, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v29, v29
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v38, v31, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v53
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v12, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v30, v32, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v38
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v33, v35, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v48, v38, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v35, v38, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v38, v48, s4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v35
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v50, v39, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v65
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v48, v39, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v39, v50, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v52, v49, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v50, v49, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v49, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v52, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v55, v53, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v64, v65, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v69
-; GFX10-NEXT:    v_cndmask_b32_e64 v36, v52, v53, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v55, v65, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s8
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v66, v54, s11
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v54
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v65, v54, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v54, v66, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v65
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s8, 0, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v67, v68, s12
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v66, v68, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v67
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v68, v67, s9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v66
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v51, v69, s13
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v68, v69, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v69, v51, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v70, s14
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v70
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v69, v70, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v70, v80, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v71, v82, s15
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v80, v82, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v71
-; GFX10-NEXT:    v_cndmask_b32_e64 v71, v82, v71, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v81, v83, s16
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v83, v82, v83, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v81
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v83, v81, s10
-; GFX10-NEXT:    buffer_load_dword v83, off, s[0:3], s32
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v85, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v85, v85
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v14, v30, s11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v87, v30, v85, s11
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v30, v35, v38, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v50, v49, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v65, v54, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v84, v32, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v37
-; GFX10-NEXT:    v_and_b32_e32 v84, 0xffff0000, v15
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v37, v34, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v48, v39, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v68
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v69
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v55, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v37
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s8, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v64
+; GFX10-NEXT:    v_cndmask_b32_e64 v96, v11, v27, s19
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v30, v39, v33, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v65
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v48, v35, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s10, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, v28, v81, s18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v15
+; GFX10-NEXT:    s_and_b32 vcc_lo, s7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v51
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s11, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v50, v36, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s8, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v52
+; GFX10-NEXT:    v_cndmask_b32_e32 v28, v53, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v69
+; GFX10-NEXT:    s_and_b32 vcc_lo, s9, s5
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v54
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v64, v49, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s13, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v70
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v96
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s14, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v55
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s15, 0, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v71
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s16, 0, v34
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32
+; GFX10-NEXT:    s_and_b32 s7, s16, s24
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v83
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v15, v83, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v66, v67, s9
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v86, v50, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v50, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v83, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v39, v68, v51, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e32 v48, v69, v70, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v53, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v66
-; GFX10-NEXT:    v_cndmask_b32_e32 v65, v55, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v50, v80, v71, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v65, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v49, v53, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v65
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e32 v52, v82, v81, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v65, v54, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v85
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v87, v85, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v15, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v34
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v34
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s10, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v65, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v34, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v35, v38, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s11, s19
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v66, v52, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s12, s20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v35
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, v67, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v36, v48
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v39, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v49, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v35, v38, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s13, s21
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v69, v68, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s14, s22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v70, v82, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s15, s23
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v55, v83, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v37
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v71, v84, s7
+; GFX10-NEXT:    s_and_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v87, v87
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v51, v37, vcc_lo
+; GFX10-NEXT:    s_and_b32 vcc_lo, s6, s5
+; GFX10-NEXT:    v_perm_b32 v14, v14, v36, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v49, v38, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v53, v85, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v97
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v27, v96, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v81
+; GFX10-NEXT:    s_and_b32 vcc_lo, s17, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v80, v85, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX10-NEXT:    v_perm_b32 v13, v13, v27, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v97, v81, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v50
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v51, v96, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v55, v87, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v53, v28, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v65
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v54, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX10-NEXT:    v_perm_b32 v13, v14, v13, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v11
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v9
-; GFX10-NEXT:    v_perm_b32 v14, v31, v28, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v12, v32, v12, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v29
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
-; GFX10-NEXT:    v_perm_b32 v11, v33, v11, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v51, v51
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v26
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s5
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v52
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v10
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v54, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v26, v10, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v55, v55
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v25, v9, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v96
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v49, v81, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_perm_b32 v12, v12, v25, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v50, v96, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v53, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v22
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v51, v9, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v54, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v23
 ; GFX10-NEXT:    v_perm_b32 v10, v30, v10, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_perm_b32 v9, v34, v9, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_perm_b32 v8, v35, v8, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX10-NEXT:    v_perm_b32 v7, v36, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v52, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v7
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v24, v8, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v51, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s5
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v52, v26
+; GFX10-NEXT:    v_perm_b32 v9, v31, v9, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s5, v53, v53
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX10-NEXT:    v_perm_b32 v8, v11, v8, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v29, v50, 0x5040100
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v51, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v26, v26
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v5
+; GFX10-NEXT:    v_perm_b32 v7, v28, v7, 0x5040100
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v49, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v21, v5, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v23
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX10-NEXT:    v_perm_b32 v5, v38, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX10-NEXT:    v_perm_b32 v3, v39, v3, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v26, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v19
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s7
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v51, v49
+; GFX10-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v23, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_perm_b32 v2, v48, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v15, v49, v51, 0x5040100
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-LABEL: v_minimumnum_v32bf16:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v68, off, s32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v36.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v39, v39
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v20
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v19
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v83, v83
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v86, v86
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v96.h, v0.l, v16.l, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s43, 0x8000, v96.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v68
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s6
+; GFX10-NEXT:    v_perm_b32 v5, v15, v5, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v6, v32, v6, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v15, v38, v37, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v21, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX10-NEXT:    v_perm_b32 v4, v33, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v23, v23
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s6
+; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v21, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s6
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v22, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s6
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v24, v23
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v49, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v21
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v22
+; GFX10-NEXT:    v_perm_b32 v3, v34, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v19
+; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s5
+; GFX10-NEXT:    s_and_b32 s5, s7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s5
+; GFX10-NEXT:    s_and_b32 s5, s9, s10
+; GFX10-NEXT:    v_perm_b32 v1, v39, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s5
+; GFX10-NEXT:    v_perm_b32 v0, v48, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v35, v2, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v32bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v55, off, s32
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v37, v12
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v50, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v39, v9 :: v_dual_and_b32 v8, 0xffff0000, v53
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v51
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v48
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v54.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v55
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v53.h, v55.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v55.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v51.h, v30.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v30.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v48.h, v29.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v29.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v37.h, v28.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v28.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v34.h, v27.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v39
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v31.h, v26.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v50
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v39.h, v25.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v50.h, v24.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v7
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v36, v35
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0x8000, v35.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.h, v31.l, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v31.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v32, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v32, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v7.h, v23.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v14.h, v30.h, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v30.h, v36.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v36, v37
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s44, 0x8000, v37.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v37.h, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.l, v36.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v32.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v32.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v6.h, v22.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v13.h, v29.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v29.h, v36.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v36, v38
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v38.h, v36.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.l, v36.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v33.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v32.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v5.h, v21.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v4.h, v20.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v12.h, v28.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.h, v28.h, v36.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v36, v39
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v39.h, v36.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v34.l, v36.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v34.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v48, v48
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v11.h, v27.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.h, v27.h, v36.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v36, v48
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v48.h, v36.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v35.l, v36.h, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v35.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v10.h, v26.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v26.h, v36.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v36, v49
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v49.h, v36.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v38.l, v36.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v38.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v9.h, v25.h, s5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.h, v25.h, v36.h, s6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v36, v50
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v50.h, v36.h, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v48.l, v36.h, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v48.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v8.h, v24.h, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.h, v24.h, v36.h, s7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v36, v51
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v51.h, v36.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v50.l, v36.h, s7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v50.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v7.h, v23.h, s7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v23.h, v36.h, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s7, v36, v52
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v52.h, v36.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.l, v52.l, v36.h, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v52.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v6.h, v22.h, s8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.h, v22.h, v36.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s8, v36, v53
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v53.h, v36.h, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.h, v53.l, v36.h, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v53.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v5.h, v21.h, s9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.h, v36.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s9, v36, v54
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v54.h, v36.h, s9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.l, v54.l, v36.h, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v54.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v4.h, v20.h, s10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.h, v20.h, v36.h, s11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s10, v36, v55
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.l, v55.h, v36.h, s10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.h, v55.l, v36.h, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v55.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v3.h, v19.h, s11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.h, v19.h, v36.h, s12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s11, v36, v64
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.l, v64.h, v36.h, s11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.l, v64.l, v36.h, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v64.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v2.h, v18.h, s12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.h, v18.h, v36.h, s13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s12, v36, v65
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.l, v65.h, v36.h, s12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.h, v65.l, v36.h, s13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v65.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v1.h, v17.h, s13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.h, v17.h, v36.h, s14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s13, v36, v66
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.l, v66.h, v36.h, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v80.l, v66.l, v36.h, s14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v66.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v0.h, v16.h, s14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.h, v16.h, v36.h, s15
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v68
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s14, v36, v67
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.h, v15.l, v68.l, s15
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.l, v67.h, v36.h, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v67.l, v36.h, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v67.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v82.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v68.l, v82.h, s15
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s15, v82, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v68.l, v36.h, v82.h, s15
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v68.l, v82.h, s16
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v14.h, v36.h, s15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v68.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v14.l, v30.l, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v28
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v30.l, v14.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v14.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v27
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s16, v14, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v20.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v38, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v35
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v3.h, v19.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v82, v82
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v36.h, v14.h, s16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v14.l, v14.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v13.h, v36.h, s16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v14.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.l, v29.l, s17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v29.l, v13.h, s18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0x8000, v13.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v23
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s17, v13, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v29.h, s16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v82, v82
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v36.h, v13.h, s17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v22
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v13.l, v13.h, s18
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v12.h, v36.h, s17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.l, v28.l, s18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v28.l, v12.h, s19
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v12.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v82, v82
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v19
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s18, v12, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v28.h, s17
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v82, v82
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v36.h, v12.h, s18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v12.l, v12.h, s19
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v36.h, s18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v12.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.l, v27.l, s19
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v27.l, v11.h, s20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v11.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v27.h, s18
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s19, v11, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v36.h, v11.h, s19
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v11.l, v11.h, s20
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.h, v10.h, v36.h, s19
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v11.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v26.l, s20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v26.l, v10.h, s21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v10.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v26.h, s19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s20, v10, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v36.h, v10.h, s20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v10.l, v10.h, s21
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.h, v9.h, v36.h, s20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v10.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v25.l, s21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v25.l, v9.h, s22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0x8000, v9.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v25.h, s20
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s21, v9, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v36.h, v9.h, s21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v9.h, s22
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v36.h, s21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v9.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v24.l, s22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v24.l, v8.h, s23
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0x8000, v8.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v24.h, s21
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s22, v8, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v36.h, v8.h, s22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v8.l, v8.h, s23
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.h, v7.h, v36.h, s22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v8.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, s23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v23.l, v7.h, s24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0x8000, v7.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v23.h, s22
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s23, v7, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v36.h, v7.h, s23
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s24
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.h, v6.h, v36.h, s23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v7.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, s24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v22.l, v6.h, s25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v22.h, s23
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s24, v6, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v36.h, v6.h, s24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s25
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.h, v5.h, v36.h, s24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, s25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v21.l, v5.h, s26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0x8000, v5.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.h, s24
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s25, v5, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v36.h, v5.h, s25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s26
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v4.h, v36.h, s25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v5.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, s26
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v19.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v38
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v38
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v2.h, v18.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v18.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v49
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v1.h, v17.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v17.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v64, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v52
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v0.h, v16.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v53
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.h, v16.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v54.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.h, v64.h, v54.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v20.l, v4.h, s27
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0x8000, v4.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v20.h, s25
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s26, v4, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v36.h, v4.h, s26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s27
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v81, v81
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v3.h, v36.h, s26
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v4.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.h, v53.l, v55.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.h, v64.h, v54.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v55.l, v66.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v51
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v30
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v66.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v64, v64
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v66.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.h, v51.l, v30.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v54.h, v66.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v30.l, v51.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v48
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v51.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v55, v55
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v51.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v48.l, v29.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v28
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v54.h, v51.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v29.l, v30.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v37
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v30.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v48, v48
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v30, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v30.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v37.l, v28.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v54.h, v30.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v28.l, v29.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v34
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v27
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v29.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v30, v30
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v29.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v34.l, v27.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v54.h, v29.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v27.l, v28.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v28.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v29, v29
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v28.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.h, v31.l, v26.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v54.h, v28.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v26.l, v27.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v27.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v28, v28
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v27.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v54.h, v27.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v25.l, v26.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v50
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v26.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v27, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v26.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v54.h, v26.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v24.l, v25.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v25.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v25.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v54.h, v25.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v23.l, v7.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v24, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v54.h, v7.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v22.l, v6.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v54.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.l, v5.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v33
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v54.h, v5.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v20.l, v4.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v36
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v54.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v19.l, v3.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v32
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v54.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v18.l, v2.h, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v81, v81
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v19.l, v83.h, s28
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v3, v3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0x8000, v83.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s27, v83, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.h, v2.l, v18.l, s28
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v1.l, v17.l, s40
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v32.h, v37.h, s44
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v36.h, v83.h, s27
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s40, 0x8000, v82.h
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v85, v85
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v18.h, s26
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v83.h, s41
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v87, v87
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v36.h, s28
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v31.h, v35.h, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0x8000, v39.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0x8000, v19.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s45, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v18.l, v82.h, s29
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0x8000, v38.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v34.h, v39.h, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s44, v82, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v33.h, v38.h, s29
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v1.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v36.h, v82.h, s44
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v50.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v1.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v82.h, s40
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v51.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v54.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v49.l, v50.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s45
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v52.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v51.l, v51.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v53.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v17.l, v19.h, s27
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v38.l, v0.h, s4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v69.l, v52.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v55.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v48.l, v1.h, s5
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v19, v36
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v69.h, v53.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v70.h, v55.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v66.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v36.h, v19.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v65.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v50.l, v2.h, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v70.l, v54.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v52.l, v0.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v16.h, v19.h, s28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v53.l, v1.h, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v71.h, v65.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v54.l, v2.h, s9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v67.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v64.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v65.l, v1.h, s12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v67.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v55.l, v3.h, s10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v16.l, v96.h, s41
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v71.l, v64.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v80.l, v66.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v17.l, s4
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v96, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v64.l, v0.h, s11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v66.l, v16.l, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v67.l, v15.l, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v68.l, v30.h, s15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v36.h, v96.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v16.l, v96.h, s43
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v36.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v16.l, v17.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v35
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v17.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v54.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v17.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v38
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v49
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v54.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v54.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v16.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v52
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v54
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v54
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v54.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v53
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v32bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v128, v119, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v132, v131, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v130, v144, v135, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v102, 16, v11
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v53, v53
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s11, v133, v133
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v49, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v51
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s17, v49, v133
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v65, v65
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v69, v69
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v81, v81
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s12, v145, v145
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v67, v68, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v64
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v71, v80, s4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v85, v85
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s10, v129, v129
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v135, v130, s12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v67
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v54, v98
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s16, v37, v132
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v80
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v83, v84, s5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v97, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v70
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s18, v53, v134
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v35, v35, v36, s15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v65, v135
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v49, v130
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v37, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v131
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v132
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v69, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v81, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v85, v135
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v97, v144
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v101, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v112, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v114, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v116, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v118, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v128, v49
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v84
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v87, v96, s6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v101, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v81
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v67, v68, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v69, v144
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v114, v114
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v96
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v87, v99, v100, s7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v113, v113
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s13, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v85
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v70, v80, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v71, v145
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v116, v116, v115, s9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v99, v103, v112, s8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v117, v117
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v147, v34, s13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v87
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v81, v84, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v83, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v102, v115, v116, s9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v116
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v85, v96, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v86, v147
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v113, v119, v118, s10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v117, v131, v128, s11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v87, v100, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v97, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v99, v112, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v101, v98
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s14, v66, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v129
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v102, v116, s15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v103, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v96
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v98
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v102
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v115
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v119
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v48
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v37, v113, v118, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v115, v132
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v64
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v117, v128, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v119, v49
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v68
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v49, v129, v130, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v131, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v80
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v38, v34, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v66, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v84
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v82, v134
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v51
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v66
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v96
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v53
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v82
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v54
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s16, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v112
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s17, s1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s18, s2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v55, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s19, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s20, s4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v116
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v80, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s21, s5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v69, v84, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s22, s6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v118
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v70, v96, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s23, s7
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v71, v100, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v83
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v38
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v128
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v130
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s27, 0, v113
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s28, 0, v115
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s29, 0, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v30
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s14, 0x8000, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s40, 0, v119
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s41, 0, v129
+; GFX11-FAKE16-NEXT:    s_and_b32 s3, s40, s14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v67
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v64, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v65, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s24, s8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v54, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v32
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v50, v65, v32 :: v_dual_lshlrev_b32 v65, 16, v15
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s25, s9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v81, v116, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s26, s10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v118, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v67, v68
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s27, s11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v83, v128, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s28, s12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v130, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s29, s13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v114, v114
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v50, v32, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v27, v27, v11, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s41, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v30, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v36, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v28
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v26, v10 :: v_dual_lshlrev_b32 v31, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v38, v32
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v31
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v26, v10, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v50, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v11
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v31, v29
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v51, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v28, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v8, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s1
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v28, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v52, v9, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v53, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v55, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v26, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v21, v5, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v25, v24
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v19
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s3
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v33, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v64, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v54, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v22, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v26, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v37, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v19
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s5, s6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v49, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v65, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v32bf16:
@@ -11981,697 +11199,579 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    scratch_load_b32 v68, off, s32
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v36.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v36.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v28
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v27
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v39, v39
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v22
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v21
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v20
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v19
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v18
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v17
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v15
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v30
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v83, v83
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v86, v86
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v36.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v96.h, v0.l, v16.l, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s43, 0x8000, v96.h
+; GFX12-TRUE16-NEXT:    scratch_load_b32 v55, off, s32
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v50, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v31, v10 :: v_dual_and_b32 v8, 0xffff0000, v53
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v39, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v54.l, 0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v30
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v24
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v51
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v23
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v22
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v20
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v48
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v19
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v18
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v17
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v16
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v54.l
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v68
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v55
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v53.h, v55.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v54.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v36, v35
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0x8000, v35.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v55.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v51.h, v30.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v29
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v37
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v30.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v48.h, v29.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v28
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v34
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v29.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v37.h, v28.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v27
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v31
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v28.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v34.h, v27.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v26
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v39
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v31.h, v26.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v25
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v50
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v39.h, v25.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.h, v31.l, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v31.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v32, v32
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v14.h, v30.h, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v30.h, v36.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v36, v37
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s44, 0x8000, v37.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v37.h, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.l, v36.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v32.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v50.h, v24.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v32, v32
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v7.h, v23.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v32.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v6.h, v22.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v32.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v5.h, v21.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v35, v35
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v54.l
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v32
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v32
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v4.h, v20.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v20.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v38, v38
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v35
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v35
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v35.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v3.h, v19.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v19.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v38
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v38
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v38.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v2.h, v18.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v18.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v49
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v49.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v1.h, v17.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v17.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v64, v64
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v52
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v52.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v0.h, v16.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v53
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v13.h, v29.h, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v29.h, v36.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v36, v38
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v38.h, v36.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.l, v36.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v33.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v12.h, v28.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.h, v28.h, v36.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v36, v39
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v39.h, v36.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v34.l, v36.h, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v34.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v48, v48
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v11.h, v27.h, s3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.h, v27.h, v36.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v36, v48
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v48.h, v36.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v35.l, v36.h, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v35.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v49, v49
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v10.h, v26.h, s4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v26.h, v36.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v36, v49
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v49.h, v36.h, s4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v38.l, v36.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v38.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v50, v50
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v9.h, v25.h, s5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.h, v25.h, v36.h, s6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v36, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v50.h, v36.h, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v48.l, v36.h, s6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v48.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v8.h, v24.h, s6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.h, v24.h, v36.h, s7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v36, v51
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v51.h, v36.h, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v50.l, v36.h, s7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v50.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v7.h, v23.h, s7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.h, v23.h, v36.h, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s7, v36, v52
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v52.h, v36.h, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.l, v52.l, v36.h, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v52.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v53, v53
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v6.h, v22.h, s8
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.h, v22.h, v36.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s8, v36, v53
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v53.h, v36.h, s8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.h, v53.l, v36.h, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v53.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v54, v54
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v5.h, v21.h, s9
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.h, v36.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s9, v36, v54
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v54.h, v36.h, s9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.l, v54.l, v36.h, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v54.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v4.h, v20.h, s10
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.h, v20.h, v36.h, s11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s10, v36, v55
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.l, v55.h, v36.h, s10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.h, v55.l, v36.h, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v55.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v64, v64
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v3.h, v19.h, s11
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.h, v19.h, v36.h, s12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s11, v36, v64
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.l, v64.h, v36.h, s11
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.l, v64.l, v36.h, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v64.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v65, v65
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v2.h, v18.h, s12
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.h, v18.h, v36.h, s13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s12, v36, v65
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.l, v65.h, v36.h, s12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.h, v65.l, v36.h, s13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v65.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v66, v66
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v1.h, v17.h, s13
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.h, v17.h, v36.h, s14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s13, v36, v66
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.l, v66.h, v36.h, s13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v80.l, v66.l, v36.h, s14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v66.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v67, v67
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v0.h, v16.h, s14
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.h, v16.h, v36.h, s15
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v68
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s14, v36, v67
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v82.h, v15.l, v68.l, s15
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.l, v67.h, v36.h, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v67.l, v36.h, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v67.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v82.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v68.l, v82.h, s15
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s15, v82, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v68.l, v36.h, v82.h, s15
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v68.l, v82.h, s16
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v14.h, v36.h, s15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v68.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v14.l, v30.l, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v28
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v30.l, v14.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v14.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v27
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s16, v14, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v82, v82
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v36.h, v14.h, s16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v26
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v14.l, v14.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v25
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v13.h, v36.h, s16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v14.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.l, v29.l, s17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v29.l, v13.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0x8000, v13.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v23
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s17, v13, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v29.h, s16
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v82, v82
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v36.h, v13.h, s17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v22
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v13.l, v13.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v21
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v12.h, v36.h, s17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v13.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.l, v28.l, s18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v28.l, v12.h, s19
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v12.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v82, v82
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v19
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s18, v12, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v28.h, s17
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v82, v82
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v36.h, v12.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v12.l, v12.h, s19
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v10
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.h, v11.h, v36.h, s18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v12.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.l, v27.l, s19
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v27.l, v11.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v11.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v27.h, s18
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s19, v11, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v36.h, v11.h, s19
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v11.l, v11.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.h, v10.h, v36.h, s19
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v11.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.l, v26.l, s20
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v26.l, v10.h, s21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v10.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v26.h, s19
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s20, v10, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v36.h, v10.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v10.l, v10.h, s21
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.h, v9.h, v36.h, s20
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v10.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.l, v25.l, s21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v25.l, v9.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0x8000, v9.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v25.h, s20
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s21, v9, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v36.h, v9.h, s21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v9.l, v9.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.h, v8.h, v36.h, s21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v9.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v24.l, s22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v24.l, v8.h, s23
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0x8000, v8.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v24.h, s21
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s22, v8, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v36.h, v8.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v8.l, v8.h, s23
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.h, v7.h, v36.h, s22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v8.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, s23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v23.l, v7.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0x8000, v7.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v23.h, s22
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s23, v7, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v36.h, v7.h, s23
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v7.l, v7.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.h, v6.h, v36.h, s23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, s24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v36
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v22.l, v6.h, s25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v22.h, s23
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s24, v6, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v36.h, v6.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v6.h, s25
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.h, v5.h, v36.h, s24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, s25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v21.l, v5.h, s26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0x8000, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.h, s24
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s25, v5, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v36.h, v5.h, s25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v5.l, v5.h, s26
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v4.h, v36.h, s25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, s26
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v36
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.h, v16.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v54.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.h, v64.h, v54.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v20.l, v4.h, s27
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v20.h, s25
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s26, v4, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v36.h, v4.h, s26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v4.l, v4.h, s27
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v81, v81
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v3.h, v36.h, s26
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.h, v53.l, v55.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.h, v64.h, v54.h, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v55.l, v66.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v51
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v30
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v66.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v54
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v64, v64
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v66.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v29
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.h, v51.l, v30.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v54.h, v66.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v30.l, v51.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v48
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v51.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v55, v55
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v51.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v48.l, v29.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v28
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v54.h, v51.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v29.l, v30.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v37
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v30.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v48, v48
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v30, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v30.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v37.l, v28.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v54.h, v30.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v28.l, v29.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v34
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v27
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v29.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v30, v30
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v29.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v34.l, v27.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v54.h, v29.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v27.l, v28.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v31
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v26
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v28.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v29, v29
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v28.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.h, v31.l, v26.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v54.h, v28.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v26.l, v27.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v27.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v28, v28
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v27.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v54.h, v27.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v25.l, v26.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v50
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v26.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v27, v27
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v26.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v54.h, v26.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v24.l, v25.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v25.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v54
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v26, v26
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v25.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.l, v23.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v54.h, v25.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v23.l, v7.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v24, v24
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v54.h, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v22.l, v6.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v54
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v54.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v21.l, v5.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v7, v33
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v54.h, v5.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v20.l, v4.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v36
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v18
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.l, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v54.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v19.l, v3.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v32
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v17
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.l, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v54.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v18.l, v2.h, s2
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v81, v81
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v19.l, v83.h, s28
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v3, v3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0x8000, v83.h
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s27, v83, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v82.h, v2.l, v18.l, s28
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v1.l, v17.l, s40
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v32.h, v37.h, s44
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v36.h, v83.h, s27
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s40, 0x8000, v82.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v85, v85
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v18.h, s26
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v83.h, s41
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v87, v87
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v36.h, s28
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v31.h, v35.h, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s42, 0x8000, v39.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0x8000, v19.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s45, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v18.l, v82.h, s29
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0x8000, v38.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v34.h, v39.h, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s44, v82, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v33.h, v38.h, s29
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v1.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v36.h, v82.h, s44
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v50.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v1.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v82.h, s40
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v51.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v54.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v49.l, v50.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s45
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v52.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v51.l, v51.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v53.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v17.l, v19.h, s27
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v38.l, v0.h, s4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v69.l, v52.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v55.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v48.l, v1.h, s5
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v19, v36
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v69.h, v53.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v70.h, v55.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v66.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v36.h, v19.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v65.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v50.l, v2.h, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v70.l, v54.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v52.l, v0.h, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v16.h, v19.h, s28
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v53.l, v1.h, s8
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v71.h, v65.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v54.l, v2.h, s9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v67.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v64.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v65.l, v1.h, s12
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v67.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v55.l, v3.h, s10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v16.l, v96.h, s41
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v71.l, v64.h, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v80.l, v66.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v16.h, v17.l, s4
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v96, v36
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v64.l, v0.h, s11
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v66.l, v16.l, s13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v67.l, v15.l, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v68.l, v30.h, s15
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v36.h, v96.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v36.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v16.l, v96.h, s43
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.h, v17.h, v36.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v16.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v36
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v16.l, v17.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v35
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v17.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v54.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v17.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v38
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v49
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v54.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v54.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v16.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v52
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v54
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.h, v54.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v54
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v54.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v53
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v32bf16:
@@ -12681,792 +11781,655 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v128, v119, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v132, v131, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v130, v144, v135, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v102, 16, v11
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v53, v53
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v82, v82
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s15, v98, v98
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s11, v133, v133
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v49, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v52
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v28, v28, v12, s15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v51
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s17, v49, v133
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v65, v65
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s2
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v69, v69
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v81, v81
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s12, v145, v145
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v39
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v67, v68, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v64
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v71, v80, s4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v85, v85
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s10, v129, v129
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v135, v130, s12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v55
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v67
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v54, v98
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s16, v37, v132
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v80
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v83, v84, s5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v97, v97
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v70
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s18, v53, v134
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v35, v35, v36, s15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s16
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v65, v135
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v49, v130
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v37, v129
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v84
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v131
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v132
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v69, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v81, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v85, v135
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v97, v144
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v101, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v112, v146
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v114, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v116, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v118, v30
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v128, v49
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v86
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v96
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v98
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v102
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v119
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v67
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v64, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v87, v96, s6
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v101, v101
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v81
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v55, v55, v64, s18
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v67, v68, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v69, v144
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v114, v114
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v96
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v87, v99, v100, s7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v113, v113
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s13, v38, v38
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v85
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v70, v80, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v71, v145
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v116, v116, v115, s9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v99, v103, v112, s8
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s9, v117, v117
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v147, v34, s13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v87
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v81, v84, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v83, v146
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v102, v115, v116, s9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v99
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v116
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v85, v96, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v86, v147
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v102
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v113, v119, v118, s10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v117, v131, v128, s11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v87, v100, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v97, v54
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v113
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v128
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v117
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v130
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v99, v112, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v101, v98
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s14, v66, v66
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v129
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v38
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v102, v116, s15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v118
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v29
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v103, v37
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v55
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v48
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v37, v113, v118, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v115, v132
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v65
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v52
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v67
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v64
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v117, v128, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v119, v49
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s18, 0, v85
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v69
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v68
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s19, 0, v86
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v49, v129, v130, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v131, v133
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v70
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v80
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s20, 0, v87
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v71
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v38, v38, v34, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v66, v53
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v84
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s21, 0, v97
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v30, v30, v14, s15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s15, v82, v134
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v51
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s16, 0, v66
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v96
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s22, 0, v98
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v29, v29, v13, s15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v53
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s17, 0, v82
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s23, 0, v99
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v54
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s16, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v112
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v39, v48, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s17, s1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s24, 0, v101
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v52, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s18, s2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v81
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v55, v64, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s19, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v37
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s20, s4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v116
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v80, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s21, s5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s25, 0, v102
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v69, v84, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s22, s6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v118
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v70, v96, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s23, s7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s26, 0, v103
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v71, v100, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v83
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v38
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v128
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v130
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s27, 0, v113
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s28, 0, v115
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v34
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s29, 0, v117
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s14, 0x8000, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v27
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s40, 0, v119
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s41, 0, v129
+; GFX12-FAKE16-NEXT:    s_and_b32 s3, s40, s14
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v14, v30, v14, s3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v65, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s24, s8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v54, v112, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v32
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v50, v65, v32 :: v_dual_lshlrev_b32 v65, 16, v15
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s25, s9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v81, v116, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s26, s10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v50
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v118, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v66
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v67, v68
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s27, s11
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v83, v128, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s28, s12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v50
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v130, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s29, s13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v66
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v67
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v114, v114
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v28
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v50, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v27, v27, v11, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s41, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v30, v15, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v36, v13, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v28
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v32, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v26, v10 :: v_dual_lshlrev_b32 v31, 16, v27
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v38, v32
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v31
+; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v26, v26, v10, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v26
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v31, v29
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v22
+; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v48, v11, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v25, v25, v9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v51, v10, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v27, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v28, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v24, v24, v8, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v27, v27
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v28, v26
+; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v52, v9, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v23, v23, v7, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v53, v8, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v22, v22, v6, s3
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v25, v25
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v55, v7, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v26, v24
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v21, v21, v5, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v23, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v20
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v25, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v19
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v20, v20, v4, s3
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v27, v26
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v19, v19, v3, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s2
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v33, v5, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v64, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v54, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v23, v23
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v22, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v17, v17, v1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v26, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v18, v18, v2, s3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v37, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v19
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, s5, s6
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v49, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s1
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v65, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <32 x bfloat> @llvm.minimumnum.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y)
   ret <32 x bfloat> %result
@@ -13584,17 +12547,13 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_bf16_no_ieee:
@@ -13644,18 +12603,12 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v1.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v1.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_bf16_no_ieee:
@@ -13677,21 +12630,17 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -13878,45 +12827,37 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v3, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee:
@@ -13971,55 +12912,43 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v2.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v1.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v3, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.l, v4.h, s2
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, v3.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v2.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v4.h, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee:
@@ -14037,50 +12966,44 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
   ret <2 x bfloat> %result
@@ -14336,66 +13259,53 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -14465,79 +13375,61 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v1.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v5.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v5.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v3, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v4.h, s2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.h, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -14550,75 +13442,66 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-FAKE16-NEXT:    s_and_b32 s0, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -14955,82 +13838,69 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v7.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v8.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v5.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
@@ -15124,97 +13994,80 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v2.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v6.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.h, v6.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v7.h, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.h
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v3.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v8.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v8.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v8.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v3, v3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v7.h, s4
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v6
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.h, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v5.h
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.h, v6.h, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.h, v5.h, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.h, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v6
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v4
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
@@ -15227,100 +14080,91 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v13, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v9
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v5, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v5
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s1, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)



More information about the llvm-commits mailing list