[llvm] [SDAG] Share signed zero handling for maximum and maximumnum (PR #142762)

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 4 03:12:46 PDT 2025


https://github.com/nikic created https://github.com/llvm/llvm-project/pull/142762

Use the same code to handle signed zero ordering for maximum and maximumnum legalization.

For maximumnum, this reduces the number of comparisons and fixes legalization for the case where the same-sized integer type is not legal.

From 32813e760e4cef7a88b0cf3fd409b6f8388f4308 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Mon, 2 Jun 2025 12:34:46 +0200
Subject: [PATCH] [SDAG] Share signed zero handling for maximum and maximumnum

Use the same code to handle signed zero ordering for
maximum and maximumnum legalization.

For maximumnum, this reduces the number of comparisons and fixes
legalization for the case where the same-sized integer type is
not legal.
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |    45 +-
 llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll  |   745 +-
 llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll  |   834 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll   | 17251 +++++++--------
 llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll   | 17958 +++++++---------
 .../CodeGen/Mips/fp-maximumnum-minimumnum.ll  |    90 +-
 .../CodeGen/X86/fminimumnum-fmaximumnum.ll    |   599 +-
 7 files changed, 16999 insertions(+), 20523 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f34bf0ca7ede0..cefcda79477ee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8559,6 +8559,23 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
   return SDValue();
 }
 
+static SDValue emitSignedZeroOrdering(SelectionDAG &DAG, bool IsMax,
+                                      SDValue MinMax, SDValue LHS, SDValue RHS,
+                                      EVT CCVT, SDNodeFlags Flags,
+                                      const SDLoc &DL) {
+  EVT VT = MinMax.getValueType();
+  SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
+                                DAG.getConstantFP(0.0, DL, VT), ISD::SETOEQ);
+  FloatSignAsInt State;
+  DAG.getSignAsIntValue(State, DL, LHS);
+  SDValue IsSpecificZero =
+      DAG.getSetCC(DL, CCVT, State.IntValue,
+                   DAG.getConstant(0, DL, State.IntValue.getValueType()),
+                   IsMax ? ISD::SETEQ : ISD::SETNE);
+  SDValue Sel = DAG.getSelect(DL, VT, IsSpecificZero, LHS, RHS, Flags);
+  return DAG.getSelect(DL, VT, IsZero, Sel, MinMax, Flags);
+}
+
 SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
                                                 SelectionDAG &DAG) const {
   if (SDValue Expanded = expandVectorNaryOpBySplitting(N, DAG))
@@ -8609,18 +8626,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
 
   // fminimum/fmaximum requires -0.0 less than +0.0
   if (!MinMaxMustRespectOrderedZero && !N->getFlags().hasNoSignedZeros() &&
-      !DAG.isKnownNeverZeroFloat(RHS) && !DAG.isKnownNeverZeroFloat(LHS)) {
-    SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
-                                  DAG.getConstantFP(0.0, DL, VT), ISD::SETOEQ);
-    FloatSignAsInt State;
-    DAG.getSignAsIntValue(State, DL, LHS);
-    SDValue IsSpecificZero =
-        DAG.getSetCC(DL, CCVT, State.IntValue,
-                     DAG.getConstant(0, DL, State.IntValue.getValueType()),
-                     IsMax ? ISD::SETEQ : ISD::SETNE);
-    SDValue Sel = DAG.getSelect(DL, VT, IsSpecificZero, LHS, RHS, Flags);
-    MinMax = DAG.getSelect(DL, VT, IsZero, Sel, MinMax, Flags);
-  }
+      !DAG.isKnownNeverZeroFloat(RHS) && !DAG.isKnownNeverZeroFloat(LHS))
+    return emitSignedZeroOrdering(DAG, IsMax, MinMax, LHS, RHS, CCVT, Flags,
+                                  DL);
 
   return MinMax;
 }
@@ -8697,17 +8705,8 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
       DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)) {
     return MinMax;
   }
-  SDValue TestZero =
-      DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
-  SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
-                                DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
-  SDValue LCmp = DAG.getSelect(
-      DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
-      MinMax, Flags);
-  SDValue RCmp = DAG.getSelect(
-      DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS, LCmp,
-      Flags);
-  return DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
+  return emitSignedZeroOrdering(DAG, IsMax, MinMax, LHS, RHS, CCVT, Flags,
+                                DL);
 }
 
 /// Returns a true value if if this FPClassTest can be performed with an ordered
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index 8c75b5c7c027e..5b7838b3e8237 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1714,30 +1714,26 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1745,38 +1741,34 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1784,50 +1776,44 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
-; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1845,9 +1831,7 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -1862,9 +1846,7 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1886,48 +1868,43 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1935,44 +1912,39 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX11-SDAG-FAKE16:       ; %bb.0:
 ; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@@ -1999,9 +1971,8 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
@@ -2010,46 +1981,40 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
@@ -2063,16 +2028,16 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v4
@@ -2081,11 +2046,8 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
@@ -2106,14 +2068,10 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call bfloat @llvm.maximumnum.bf16(bfloat %a, bfloat %b)
@@ -2178,22 +2136,20 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2203,49 +2159,43 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2255,22 +2205,20 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX900-SDAG-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
+; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2280,48 +2228,42 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v5
+; GFX900-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2332,27 +2274,24 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX950-SDAG-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
+; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    s_nop 0
@@ -2363,17 +2302,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
@@ -2381,47 +2316,42 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v5
-; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
+; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX950-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2448,19 +2378,15 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -2483,18 +2409,14 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
@@ -2526,14 +2448,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v7
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v6, v8
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v6.l, v1.l, v0.l, s0
@@ -2542,75 +2463,67 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v3.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v2.h, v1.l, s1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s2
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v7
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v1.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -2623,80 +2536,68 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v4, v3 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v3 :: v_dual_lshlrev_b32 v4, 16, v5
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v7, 16, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
@@ -2734,11 +2635,10 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v7
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v6, v8
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
@@ -2751,17 +2651,14 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v3.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -2793,14 +2690,13 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v7
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
@@ -2812,17 +2708,14 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v1.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -2842,100 +2735,86 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v4, v3 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v3 :: v_dual_lshlrev_b32 v4, 16, v5
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v7, 16, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
index fd7c7006b3612..c4818956a71d4 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
@@ -1713,32 +1713,27 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -1746,39 +1741,34 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
-; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -1786,50 +1776,44 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
-; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -1845,11 +1829,9 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
@@ -1862,11 +1844,9 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1888,48 +1868,43 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1937,44 +1912,39 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX11-SDAG-FAKE16:       ; %bb.0:
 ; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@@ -2001,57 +1971,50 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
@@ -2065,29 +2028,26 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
@@ -2106,16 +2066,12 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call bfloat @llvm.minimumnum.bf16(bfloat %a, bfloat %b)
@@ -2180,23 +2136,20 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2206,49 +2159,43 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2258,23 +2205,20 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX900-SDAG-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
-; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -2284,48 +2228,42 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v5
+; GFX900-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
+; GFX900-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2336,27 +2274,24 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX950-SDAG-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_sdwa v4, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    s_nop 0
@@ -2367,17 +2302,13 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
@@ -2385,48 +2316,42 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v5
-; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
+; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-SDAG-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX950-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2453,19 +2378,15 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -2488,18 +2409,14 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
@@ -2531,91 +2448,82 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v7
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v6, v8
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v6.l, v1.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v3.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v3.l, v2.h, v1.l, s1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s2
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v7
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v1.l, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -2628,80 +2536,68 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v4, v3 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v3 :: v_dual_lshlrev_b32 v4, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v7, 16, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
@@ -2739,34 +2635,30 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v7
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v6, v8
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v6.l, v1.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v3.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -2798,36 +2690,32 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v7
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v1.l, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -2847,100 +2735,86 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v3 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v4, v3 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v1, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v3 :: v_dual_lshlrev_b32 v4, 16, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v7, 16, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v4, v1 :: v_dual_lshlrev_b32 v6, 16, v0
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v3
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
index 7d9b46a10c8f1..66307ba8ba735 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -34,12 +34,10 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -48,21 +46,19 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximumnum_bf16:
@@ -70,27 +66,24 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_bf16:
@@ -108,9 +101,7 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -136,15 +127,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -166,10 +154,8 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -202,18 +188,14 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -243,11 +225,8 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@@ -275,49 +254,41 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximumnum_bf16_nnan:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximumnum_bf16_nnan:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_bf16_nnan:
@@ -329,9 +300,7 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -346,13 +315,11 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -365,12 +332,9 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -390,15 +354,12 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -417,14 +378,10 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
@@ -455,22 +412,20 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -480,12 +435,10 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -497,22 +450,20 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -522,12 +473,10 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
@@ -539,27 +488,24 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
@@ -572,14 +518,11 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
@@ -608,18 +551,14 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
@@ -651,14 +590,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v1.l, v0.l, s0
@@ -667,20 +605,16 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -694,39 +628,36 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -762,11 +693,10 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
@@ -779,17 +709,14 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -811,46 +738,41 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -885,26 +807,22 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v3, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -915,26 +833,22 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -948,32 +862,26 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -982,28 +890,24 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1014,27 +918,23 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
@@ -1046,32 +946,26 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v1, v0 :: v_dual_and_b32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1087,29 +981,26 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.h
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, s2
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
@@ -1125,39 +1016,30 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v1, v0 :: v_dual_and_b32 v5, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1195,22 +1077,20 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1220,12 +1100,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -1237,12 +1115,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1254,22 +1130,20 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1279,12 +1153,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -1296,12 +1168,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
@@ -1313,27 +1183,24 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -1346,14 +1213,11 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
@@ -1369,14 +1233,11 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
@@ -1413,29 +1274,23 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1445,65 +1300,65 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v9, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16:
@@ -1512,59 +1367,53 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16:
@@ -1576,76 +1425,70 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16:
@@ -1658,75 +1501,67 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -1763,36 +1598,30 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1805,37 +1634,31 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1849,14 +1672,11 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
@@ -1864,72 +1684,60 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3bf16_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1938,86 +1746,77 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v9, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v9, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v10, v7 :: v_dual_lshlrev_b32 v2, 16, v6
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v6, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16_nnan:
@@ -2028,49 +1827,44 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v9, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v9, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_nnan:
@@ -2080,57 +1874,45 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v10, v7 :: v_dual_lshlrev_b32 v2, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v6, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -2172,41 +1954,37 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2216,12 +1994,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2233,12 +2009,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2252,41 +2026,37 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2296,12 +2066,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2313,12 +2081,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
@@ -2331,51 +2097,46 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -2388,14 +2149,11 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -2411,14 +2169,11 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
@@ -2447,57 +2202,49 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v14
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2508,84 +2255,76 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v11
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v9, v14
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v13, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16:
@@ -2611,63 +2350,54 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2681,98 +2411,86 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v11
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v9, v14
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v13, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16:
@@ -2805,78 +2523,69 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
@@ -2918,53 +2627,45 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX8-LABEL: v_maximumnum_v4bf16_nnan:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2979,50 +2680,42 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v4, s4
@@ -3037,64 +2730,51 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    v_perm_b32 v1, v1, v4, s0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3103,54 +2783,46 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v4bf16_nnan:
@@ -3159,109 +2831,93 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v7, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v6, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v8, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v5, v6
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v10, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v3.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.l, v1.h, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v7, v5 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_and_b32 v9, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v2, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v13, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v11 :: v_dual_lshlrev_b32 v2, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v4, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v4bf16_nnan:
@@ -3274,61 +2930,50 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v5, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v4
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v7, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v8, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v6, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.h
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v5, v6
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v10, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v3.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, s0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.l, v1.h, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
@@ -3338,74 +2983,61 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v7, v5 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_and_b32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v2, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v13, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v11 :: v_dual_lshlrev_b32 v2, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v1, v4, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
   ret <4 x bfloat> %result
@@ -3460,60 +3092,54 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3523,12 +3149,10 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -3540,12 +3164,10 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
@@ -3557,12 +3179,10 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3579,60 +3199,54 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
 ; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3642,12 +3256,10 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -3659,12 +3271,10 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
@@ -3676,12 +3286,10 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v8, v0, s4
@@ -3696,76 +3304,70 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
-; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
+; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v10
-; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
 ; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX950-NEXT:    s_nop 0
@@ -3778,14 +3380,11 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
@@ -3801,14 +3400,11 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
@@ -3824,14 +3420,11 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v8, v0, s0
@@ -3847,108 +3440,96 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_sdwa v12, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v15, v14, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v14, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v10, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v14, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v8, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v10, v13, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v13
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v13, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v8, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX10-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX10-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v2, v7, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3963,119 +3544,106 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v10, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.h, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v13, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v14, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v14, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v14, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v12.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v9.l, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v9.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v7
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v10, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v14, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v15, v14
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v12.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v14
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v2.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v1.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.l, v1.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.l, v2.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v6bf16:
@@ -4085,115 +3653,104 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v8, v7 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v7, v6 :: v_dual_lshlrev_b32 v13, 16, v7
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v11 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v8 :: v_dual_lshlrev_b32 v8, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v13, v7 :: v_dual_lshlrev_b32 v14, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v12, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v8, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v13, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v4, v1 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v3, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_lshlrev_b32 v10, 16, v11
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4212,136 +3769,124 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v10, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.h, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v13, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v14, v14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v13
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v14, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v10.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v12.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v13.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v14, v16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v9.l, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v9.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v12.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v7
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v10, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v14, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.l, v10.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v15, v14
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v9.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v12.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v14
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v10
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v2.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v1.h, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.l, v1.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.l, v2.l, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v6bf16:
@@ -4355,146 +3900,131 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v8, v7 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v7, v6 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v11 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v14
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v8 :: v_dual_lshlrev_b32 v8, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v13, v7 :: v_dual_lshlrev_b32 v14, 16, v9
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v12, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v8, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v13, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v4, v1 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v3, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_lshlrev_b32 v10, 16, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x bfloat> @llvm.maximumnum.v6bf16(<6 x bfloat> %x, <6 x bfloat> %y)
@@ -4562,79 +4092,71 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4644,12 +4166,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -4661,12 +4181,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
@@ -4678,12 +4196,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -4695,12 +4211,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -4719,79 +4233,71 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX900-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX900-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX900-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4801,12 +4307,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -4818,12 +4322,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
@@ -4835,12 +4337,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -4852,12 +4352,10 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v11, v0, s4
@@ -4873,101 +4371,92 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v11
-; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v12
-; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
+; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
+; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v13
-; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
+; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX950-NEXT:    s_nop 0
@@ -4980,14 +4469,11 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
@@ -5003,14 +4489,11 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v7
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
@@ -5026,14 +4509,11 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -5049,14 +4529,11 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v11, v0, s0
@@ -5072,7 +4549,7 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
@@ -5084,134 +4561,118 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v9
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v17, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v10, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v4, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v15, v3, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5219,171 +4680,152 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v5.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v19, v17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v13.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v10.l, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v15, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v13, v18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v14.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v15.l, v9.l, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v11, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v19, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v10.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v12.l, s2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v11.l, v9.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v3.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v13.l, v12.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.l, v8.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v9.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v10.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v8.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v15, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v7.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v6.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v14, v13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v5.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v4.l, v0.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v3.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v1.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v8
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v2.l, s5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.l, v1.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.l, v0.l, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v9.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v7
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v8bf16:
@@ -5393,7 +4835,7 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -5402,139 +4844,131 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v12, v11 :: v_dual_lshlrev_b32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v9, v8 :: v_dual_and_b32 v15, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v11, v10 :: v_dual_lshlrev_b32 v14, 16, v12
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v16, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v15
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v13, v11 :: v_dual_lshlrev_b32 v17, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v11 :: v_dual_lshlrev_b32 v14, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v7, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v5, v1 :: v_dual_lshlrev_b32 v13, 16, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v4, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v15, v3, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5546,201 +4980,178 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v11.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v8.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v15, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v5.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v16, v14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v12.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v19, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v13.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v13, v18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v11.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v14.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v13.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v10.l, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v15.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v15.l, v9.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v11, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v14.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v9.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v19, v17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v10.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v12.l, s2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v11.l, v9.l, s3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v3.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v12.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v10.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v9.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v13.l, v12.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v17
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.l, v8.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v13, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v9.h, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v10.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v8.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v15, v12
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v4.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v7.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v6.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v14, v13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v10.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v5.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v4.l, v0.l, s0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v3.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v1.h, s3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v8
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v2.l, s5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v14
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.l, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.l, v1.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.l, v0.l, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v9.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v7
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v8bf16:
@@ -5754,7 +5165,7 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -5765,183 +5176,166 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v12, v11 :: v_dual_lshlrev_b32 v13, 16, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v9, v8 :: v_dual_and_b32 v15, 0xffff0000, v6
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v11, v10 :: v_dual_lshlrev_b32 v14, 16, v12
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v15
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v9
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v13, v11 :: v_dual_lshlrev_b32 v17, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v11 :: v_dual_lshlrev_b32 v14, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v7, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v16
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v5, v1 :: v_dual_lshlrev_b32 v13, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y)
@@ -6059,155 +5453,139 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v22, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6217,12 +5595,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
@@ -6234,12 +5610,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
@@ -6251,12 +5625,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
@@ -6268,12 +5640,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
@@ -6285,12 +5655,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -6302,12 +5670,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
@@ -6319,12 +5685,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -6336,12 +5700,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -6368,155 +5730,139 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
+; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff0000, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
 ; GFX900-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
-; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
-; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
-; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v22, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
-; GFX900-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
-; GFX900-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
+; GFX900-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
-; GFX900-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX900-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v27
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6526,12 +5872,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v7
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
@@ -6543,12 +5887,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
@@ -6560,12 +5902,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
@@ -6577,12 +5917,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
@@ -6594,12 +5932,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -6611,12 +5947,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
@@ -6628,12 +5962,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -6645,12 +5977,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v23, v0, s4
@@ -6670,198 +6000,180 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
-; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v15
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v19
-; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
-; GFX950-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
+; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
 ; GFX950-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v20
-; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
+; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v13
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v21
-; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
+; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v22
-; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
+; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v11
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v23
-; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
+; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v22, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v10
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v22, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v10
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v24
-; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
+; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v23, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v25
-; GFX950-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
+; GFX950-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v24, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v26
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v23, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v23, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v27
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX950-NEXT:    s_nop 0
@@ -6874,14 +6186,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
@@ -6897,14 +6206,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v15
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
@@ -6920,14 +6226,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v14
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
@@ -6943,14 +6246,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v13
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
@@ -6966,14 +6266,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
@@ -6989,14 +6286,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v11
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
@@ -7012,14 +6306,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v10
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
@@ -7035,14 +6326,11 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v23, v0, s0
@@ -7059,13 +6347,10 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
@@ -7075,277 +6360,248 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
 ; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v5
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v23, v22, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v19, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v20, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v26
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v18, v21, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v24, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v22, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v20, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v25, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v26, v23, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v23, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v22
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v23, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v25, v24, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v27, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v29
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v21, v24, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v24, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v27
 ; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v23, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v30, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v29, v25, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v14
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v26, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v27, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v26
+; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
 ; GFX10-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX10-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX10-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX10-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7353,340 +6609,309 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v18, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v15.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v17.h, v14.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v20, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v5.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v24
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v7.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v19.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v23.l, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v25, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v18.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v20.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v25
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v4.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v6.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v19.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v12.h, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v18.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v21.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v2.h, v10.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v10.h, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v19.l, v18.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v22.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v21.l, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v18.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v24, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v1.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.l, v6.l, v22.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v23.l, v7.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v9.h, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v21.l, v20.l, s2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v22.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v26, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v19.l, v22.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v20.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v21.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v5.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v19.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v7.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v27, v25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v7.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v23, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v0.h, v8.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v18.l, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v7.l, v19.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v24.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v21.l, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v21, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v7.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v23.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v23, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v16.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v19.l, s0
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v15.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v23, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v25, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v21.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v21.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v18.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v17.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v16.l
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v17.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v14.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v15.l, v16.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v14.l, v17.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v15.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v17.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v14.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v14.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v19.l, v6.l, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v18, v17
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v13.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v12.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v13.l, v5.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v17.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v23, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v16.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v17, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v10.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v12.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v22, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s3
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v13.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v15, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v5.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v22, v17
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v24, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v15.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v9.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v8.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v17
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v9.l, v1.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v2.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v11.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.l, v0.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v0.h, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v15.l, v1.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v10.l, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v20
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v16 :: v_dual_mov_b32 v4, v19
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v18
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v16bf16:
@@ -7696,305 +6921,273 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v19
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v18, v16 :: v_dual_and_b32 v21, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v23, v22 :: v_dual_lshlrev_b32 v18, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v19, v20 :: v_dual_and_b32 v23, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v20 :: v_dual_lshlrev_b32 v26, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v18, v21, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v24, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v23, v18 :: v_dual_and_b32 v21, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v22, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v20, 16, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v19 :: v_dual_and_b32 v24, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v29
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v23, v21 :: v_dual_lshlrev_b32 v28, 16, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v23, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v23 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v27 :: v_dual_and_b32 v25, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v24 :: v_dual_lshlrev_b32 v24, 16, v26
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v30, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v23, 16, v24
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v29, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v24, v22 :: v_dual_lshlrev_b32 v23, 16, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v27 :: v_dual_lshlrev_b32 v28, 16, v6
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v27
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v14, v6 :: v_dual_lshlrev_b32 v15, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v13, v5 :: v_dual_lshlrev_b32 v25, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v12, v4 :: v_dual_lshlrev_b32 v15, 16, v14
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v14, 16, v10
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8006,405 +7199,361 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v18, v5
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v15
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v16
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v17
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v16
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v15.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v17.h, v14.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v13
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v20, v21
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v5.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v19.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v23.l, v7.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v7.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v24, v25
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v24
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v19.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v7.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v18.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v25, v26
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v6.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v18.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v20.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v25
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v20.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v22.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v26, v21
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v19.l, v22.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v4.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v20.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v21.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v12.h, v18.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v19.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v23
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v18.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v18.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v20.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v21.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v2.h, v10.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v7.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v10.h, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v19.l, v18.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v22.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v21.l, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v9
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v18.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v24, v25
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v1.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v20.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.l, v6.l, v22.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v23.l, v7.l, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v9.h, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v21.l, v20.l, s2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v24.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v21.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v22.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v27, v25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v23, v25
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v0.h, v8.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v18.l, v20.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v7.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v7.l, v19.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v24.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v21.l, s1
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v21, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v19.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v23.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v19.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v23, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v15.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v25, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v23, v24
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v21.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v14.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v21.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v18.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v22
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v16.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v19.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v15.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v14.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v17.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v17.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v14.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v16.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v24, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v15.l, v16.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v14.l, v17.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v13.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v15.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v12.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v19.l, v6.l, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v17.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v16.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v14.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v14.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v13.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v17.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v13.l, v5.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v17.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v18, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v23, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v3.h, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v16.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v17, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v10.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v12.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v22, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s3
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v15
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v13.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v15, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v13.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v5.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v22, v17
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v24, v23
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v15.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v9.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v8.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v14.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v17
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v9.l, v1.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v2.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v11.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.l, v0.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v0.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v15.l, v1.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v10.l, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.l, s1
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v20 :: v_dual_mov_b32 v3, v16
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v19 :: v_dual_mov_b32 v5, v18
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v16bf16:
@@ -8418,18 +7567,16 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
@@ -8442,374 +7589,322 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v18
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v21, v22
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v18, v16 :: v_dual_and_b32 v21, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v23, v22 :: v_dual_lshlrev_b32 v18, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v19, v20 :: v_dual_and_b32 v23, 0xffff0000, v13
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v17, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v20 :: v_dual_lshlrev_b32 v26, 16, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v18, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v24, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v23, v18 :: v_dual_and_b32 v21, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v22, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v19, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v20, 16, v22
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v19 :: v_dual_and_b32 v24, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v29
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v23, v21 :: v_dual_lshlrev_b32 v28, 16, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v22
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v23, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v23 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v27 :: v_dual_and_b32 v25, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v29
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v21, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v24 :: v_dual_lshlrev_b32 v24, 16, v26
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v30, v29, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v23, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v29, v25, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v24, v22 :: v_dual_lshlrev_b32 v23, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v26
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v27 :: v_dual_lshlrev_b32 v28, 16, v6
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v14, v6 :: v_dual_lshlrev_b32 v15, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v13, v5 :: v_dual_lshlrev_b32 v25, 16, v11
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v12, v4 :: v_dual_lshlrev_b32 v15, 16, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y)
@@ -9085,314 +8180,283 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v38, v35, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v32, v31, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
-; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
-; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v37, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v37
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v53
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v37, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v31, v39, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v52, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v33, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v32
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v33, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v34, v33, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v35, v37
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v51, v50, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v50, v34, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
 ; GFX8-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v36, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
 ; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v27
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
 ; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v37, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v25
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
 ; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v39, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v24
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v50
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
 ; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v48, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v23
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v51
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
 ; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v22
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
 ; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v50, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v21
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
 ; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v51, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v20
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
 ; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v52, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v19
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v40
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
 ; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v53, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v18
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
 ; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v54, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
+; GFX8-NEXT:    v_and_b32_e32 v40, 0xffff0000, v17
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
-; GFX8-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
+; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
+; GFX8-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
 ; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v40, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v40, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
+; GFX8-NEXT:    v_and_b32_e32 v41, 0xffff0000, v16
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
 ; GFX8-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
-; GFX8-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
+; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
+; GFX8-NEXT:    v_lshlrev_b32_e32 v43, 16, v40
+; GFX8-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v42, v43
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v41, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -9402,12 +8466,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v40
 ; GFX8-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
@@ -9419,12 +8481,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v55
 ; GFX8-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
@@ -9436,12 +8496,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v55, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
@@ -9453,12 +8511,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v30, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
@@ -9470,12 +8526,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v29, v28
 ; GFX8-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
@@ -9487,12 +8541,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v28, v27
 ; GFX8-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
@@ -9504,12 +8556,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v27, v26
 ; GFX8-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
@@ -9521,13 +8571,15 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX8-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
@@ -9538,16 +8590,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
@@ -9558,12 +8605,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
@@ -9575,12 +8620,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
@@ -9592,12 +8635,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v21
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
@@ -9609,12 +8650,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
@@ -9626,12 +8665,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
@@ -9643,12 +8680,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
@@ -9660,12 +8695,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v54
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -9693,11 +8726,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v34
 ; GFX8-NEXT:    v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -9707,315 +8740,284 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX900-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX900-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v38, v35, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v32, v31, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX900-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX900-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
-; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
-; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
-; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
-; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
-; GFX900-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX900-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
-; GFX900-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
-; GFX900-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v37, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v37
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v53
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v37, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v31
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX900-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v31, v39, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v52, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; GFX900-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX900-NEXT:    s_waitcnt vmcnt(3)
+; GFX900-NEXT:    s_waitcnt vmcnt(4)
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v33, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v32
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX900-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v33, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v33
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v34, v33, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v35, v37
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v51, v50, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v50, v34, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
 ; GFX900-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v36, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
 ; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v27
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
 ; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v37, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v25
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
 ; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v39, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v24
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v50
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
 ; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v48, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v23
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v51
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
 ; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v22
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
 ; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v50, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v21
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
 ; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v51, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v20
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
 ; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v52, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v19
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v40
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
 ; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v53, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v18
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
 ; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v54, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
+; GFX900-NEXT:    v_and_b32_e32 v40, 0xffff0000, v17
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
-; GFX900-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
+; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
+; GFX900-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
 ; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v40, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v40, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
+; GFX900-NEXT:    v_and_b32_e32 v41, 0xffff0000, v16
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
 ; GFX900-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
-; GFX900-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
+; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
+; GFX900-NEXT:    v_lshlrev_b32_e32 v43, 16, v40
+; GFX900-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v42, v43
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v41, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -10025,12 +9027,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v40
 ; GFX900-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
@@ -10042,12 +9042,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v55
 ; GFX900-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v30
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
@@ -10059,12 +9057,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v55, v30
 ; GFX900-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v29
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
@@ -10076,12 +9072,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v30, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
@@ -10093,12 +9087,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v29, v28
 ; GFX900-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v27
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
@@ -10110,16 +9102,18 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v28, v27
 ; GFX900-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX900-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
@@ -10127,12 +9121,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v27, v26
 ; GFX900-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
@@ -10140,19 +9132,14 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
-; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
@@ -10164,12 +9151,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
@@ -10181,12 +9166,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v23
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
@@ -10198,12 +9181,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
@@ -10215,12 +9196,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v21
 ; GFX900-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
@@ -10232,12 +9211,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v20
 ; GFX900-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
@@ -10249,12 +9226,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v19
 ; GFX900-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
@@ -10266,12 +9241,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
@@ -10283,12 +9256,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v17
 ; GFX900-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
 ; GFX900-NEXT:    v_perm_b32 v0, v54, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v53, v1, s4
@@ -10303,484 +9274,430 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_perm_b32 v10, v36, v10, s4
 ; GFX900-NEXT:    v_perm_b32 v11, v35, v11, s4
 ; GFX900-NEXT:    v_perm_b32 v12, v34, v12, s4
-; GFX900-NEXT:    v_perm_b32 v13, v33, v13, s4
+; GFX900-NEXT:    v_perm_b32 v13, v32, v13, s4
 ; GFX900-NEXT:    v_perm_b32 v14, v31, v14, s4
-; GFX900-NEXT:    v_perm_b32 v15, v32, v15, s4
+; GFX900-NEXT:    v_perm_b32 v15, v33, v15, s4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximumnum_v32bf16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    scratch_load_dword v50, off, s32
+; GFX950-NEXT:    scratch_load_dword v51, off, s32
 ; GFX950-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX950-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX950-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX950-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v39
-; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v22
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v38, v35, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v31
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v21
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v37
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v32, v31, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v20
-; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v19
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX950-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v37, v35, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v48
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX950-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
 ; GFX950-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
-; GFX950-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v54
 ; GFX950-NEXT:    v_accvgpr_write_b32 a2, v42 ; Reload Reuse
-; GFX950-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
+; GFX950-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v37, v35, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v31
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v50
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v50
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v51
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v31, v39, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v51
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v33, v37
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v53, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v32
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v33, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v33
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v33
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX950-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v34, v33, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v35, v37
+; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v34
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v52, v50, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v36, v37
-; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v50, v34, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v34
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
+; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
 ; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v36, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
+; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v27
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v37, v38
-; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
+; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
 ; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v37, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
+; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v38, v39
-; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
+; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
 ; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v25
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v39, v48
-; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
+; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
 ; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v39, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v24
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v48, v49
-; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v50
+; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
 ; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v48, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v23
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v49, v51
-; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v6
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v48
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v52
+; GFX950-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
 ; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v49, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX950-NEXT:    v_and_b32_e32 v50, 0xffff0000, v22
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v48
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v52
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v49
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v49, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v51
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
 ; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
-; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v21
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v51, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v50, vcc
+; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v49
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v49, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v52, v53
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v50
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v50, v49, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v51, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX950-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v52, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v20
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v53, v52, vcc
+; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v50
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v53, v54
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v50, vcc
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v52, v50, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v55
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
 ; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v53, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v19
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
+; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v54, v55
-; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v55, v40
+; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
 ; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
+; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v18
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v55, v40
-; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v54, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
+; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
 ; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v55, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v17
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v55, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
+; GFX950-NEXT:    v_and_b32_e32 v40, 0xffff0000, v17
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v54, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v41
-; GFX950-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
+; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v55
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v40, v55, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
+; GFX950-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v55, v54, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
 ; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v40, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
-; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
+; GFX950-NEXT:    v_and_b32_e32 v41, 0xffff0000, v16
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
+; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v55
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v40, v40, v55, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v42
-; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
+; GFX950-NEXT:    v_lshlrev_b32_e32 v43, 16, v40
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v41, v40, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v40
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v40, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v42, v43
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v40, v55, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v41, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v51
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v51, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v15, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v51
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v41, v40
-; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v40, v50, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX950-NEXT:    v_cndmask_b32_e32 v40, v51, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v40
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v14
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v51, v15, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v14
+; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
-; GFX950-NEXT:    v_perm_b32 v15, v32, v15, s0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT:    v_perm_b32 v15, v33, v15, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v50
-; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v50, v30, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v40, v51
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v30
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v30, v14, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v51
+; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v14
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v50
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
+; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v51, v14, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX950-NEXT:    v_perm_b32 v14, v31, v14, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v13
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v13
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v50, v30
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v51, v30
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v30
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v13
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v29
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
-; GFX950-NEXT:    v_perm_b32 v13, v33, v13, s0
+; GFX950-NEXT:    v_perm_b32 v13, v32, v13, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
@@ -10790,14 +9707,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v30, v29
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v28
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
@@ -10813,14 +9727,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v29, v28
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v11
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v27
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
@@ -10836,14 +9747,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v28, v27
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v10
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v26
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
@@ -10859,14 +9767,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v27, v26
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v25
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
@@ -10882,14 +9787,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v26, v25
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
@@ -10905,14 +9807,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
@@ -10928,14 +9827,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v24, v23
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
@@ -10951,14 +9847,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v23, v22
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
@@ -10974,20 +9867,17 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v22, v21
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX950-NEXT:    v_perm_b32 v4, v51, v4, s0
+; GFX950-NEXT:    v_perm_b32 v4, v50, v4, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
@@ -10997,14 +9887,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v21, v20
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
@@ -11020,14 +9907,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v20, v19
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
@@ -11043,14 +9927,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v19, v18
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
@@ -11066,14 +9947,11 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v18, v17
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v16
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v55, v0, s0
@@ -11082,600 +9960,536 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX10-LABEL: v_maximumnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v29
-; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
+; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v30
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v33, v32, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v38, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; GFX10-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v27
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v32
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v35, v34, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v12
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v48, v39, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
-; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v23
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v35, v32, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v33
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v70, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v38, v34, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
-; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX10-NEXT:    v_cndmask_b32_e32 v35, v39, v33, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v31, v38
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v26
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v53, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v35
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v39, v48
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v9
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v25
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v49, v50
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v52, v38, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v39, v39
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v38
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v50, v49, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v49, v39, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v52, v52
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v54, v53, s6
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v51, v55
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v49
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v49, s7
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v55, v55
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s8, v31, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v65, v64, s7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v36, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v34, v33, s4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v35, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v33
+; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v36, v37, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v38, v38
+; GFX10-NEXT:    v_and_b32_e32 v70, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v81, 0xffff0000, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, v49, v48, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v50, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v34
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v70, v70
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v81, v81
+; GFX10-NEXT:    v_lshrrev_b32_e32 v85, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v37, v36, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v51, v51
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v10
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s20, 0, v31
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v48, v35, s4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v39, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v37
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v51, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s6, v39, v48
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, v49, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, v53, v52, s7
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v25
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v48, v48
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v52, v39, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v51, v50, s7
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v8
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v53, v53
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v50, v49, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v55, v54, s7
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v64, v55, s7
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v65, v65
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v48
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s8, v64, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v54, v51, s7
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v7
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s9, v52, v55
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v22
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v67, v66, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v52
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v64, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v65
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s9, v54, v67
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v66, v65, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v54, v54
+; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v67, v66, s7
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s7, v68, v69
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v4
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v54, v54
-; GFX10-NEXT:    v_lshrrev_b32_e32 v69, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v67, v66, s10
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v68, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v70, v69, s10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v67, v67
-; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v66, v54, s10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v71, v71
-; GFX10-NEXT:    v_lshrrev_b32_e32 v71, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v69, v68, s10
-; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v3
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s11, v70, v81
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v67
+; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v66, v54, s10
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v65, v64, s10
+; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v66, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v66, v68, v67, s10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v65, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v71, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v69, v69
-; GFX10-NEXT:    v_and_b32_e32 v70, 0xffff0000, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s12, v82, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v71, s10
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s10, v31, v51
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v64
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v67, v66, s10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v52
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s11, v69, v70
+; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v65
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s10, v68, v67
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v83, v82, s13
+; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s12, v71, v80
+; GFX10-NEXT:    v_lshrrev_b32_e32 v70, 16, v19
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v68, v68
+; GFX10-NEXT:    v_lshrrev_b32_e32 v71, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v68, v82, v67, s13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v69, v69
+; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v68
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v71, v70, s13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v80, v80
+; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v18
-; GFX10-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v69
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v71, v69, s13
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v70, v70
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v81, v80, s13
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v82, v82
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v80, v70, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v70, v70, v69, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v71, v71
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v82, v81, s13
+; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, v82, v80, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v71, v71
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s13, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v70
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v71, v81, v82, s14
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s14, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v82
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s13, v81, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v69
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v70
+; GFX10-NEXT:    v_cndmask_b32_e64 v71, v80, v82, s14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v82
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s14, v81, v83
+; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v71
-; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v0
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s15, v31, v81
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v16
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s15, v80, v81
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v80, v80
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v83, v83, v81, s16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v81, v83, s16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v81
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s16, v31, v84
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v31, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v84, s17
-; GFX10-NEXT:    v_and_b32_e32 v85, 0xffff0000, v30
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v80, v80
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v81, v83, s16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v80
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s16, v81, v84
+; GFX10-NEXT:    v_and_b32_e32 v81, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v16
+; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v85, v84, s17
+; GFX10-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v85, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v84, v31, s17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v81
+; GFX10-NEXT:    v_cndmask_b32_e64 v84, v84, v81, s17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v84
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s17, v85, v86
-; GFX10-NEXT:    v_lshrrev_b32_e32 v86, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v84, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v31, v84, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v85
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s17, 0, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v37, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v84, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v37
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v32, v37, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v37, v36, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v34
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v37, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v36
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v34, v36, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v35, v33, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v33
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v38
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v35
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v33, v35, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v48, v38, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v35, v38, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v38, v48, s4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v35
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v50, v39, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v65
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v48, v39, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v39, v50, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v52, v49, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v50, v49, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v49, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v52, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v55, v53, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v64, v65, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v69
-; GFX10-NEXT:    v_cndmask_b32_e64 v36, v52, v53, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v55, v65, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s8
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v66, v54, s11
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v54
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v65, v54, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v54, v66, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v65
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s8, 0, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v67, v68, s12
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v66, v68, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0, v67
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v68, v67, s9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v66
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v51, v69, s13
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v68, v69, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v69, v51, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v70, s14
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v70
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v69, v70, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v70, v80, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v71, v82, s15
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v80, v82, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v71
-; GFX10-NEXT:    v_cndmask_b32_e64 v71, v82, v71, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v81, v83, s16
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v83, v82, v83, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0, v81
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v83, v81, s10
-; GFX10-NEXT:    buffer_load_dword v83, off, s[0:3], s32
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v85, v85
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v85, v85
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v14, v30, s11
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v85, v85
+; GFX10-NEXT:    v_cndmask_b32_e64 v85, v14, v30, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v85
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v30, v85, s18
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v87, v30, v85, s11
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v30, v35, v38, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v50, v49, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v65, v54, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v84, v32, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v37
-; GFX10-NEXT:    v_and_b32_e32 v84, 0xffff0000, v15
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v37, v34, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v48, v39, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v68
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s18, v86, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v86, v13, v29, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v86
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v13, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v87, v29, v86, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v87
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s19, v14, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v32, v31, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v32, v31, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, v87, v86, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v13
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v13, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v34, v33, s4
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v33
+; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0, v85
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v29, v34, v33, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v48, v39, s9
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v38, v36, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v38, v36, vcc_lo
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v29
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v50, v49, s8
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v37, v35, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v29, v29, v31, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v35
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v37, v35, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v48, v39, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v49
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, v53, v51, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v50, v49, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v52, v54, s10
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v53, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v64, v55, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v65, v66, s12
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v52, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v64, v55, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v68, v67, s13
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v65, v66, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v70, v69, s14
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v68, v67, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v69
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v71, v82, s15
+; GFX10-NEXT:    v_cndmask_b32_e32 v64, v70, v69, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v82
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v83, s16
+; GFX10-NEXT:    v_cndmask_b32_e32 v66, v71, v82, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v83
+; GFX10-NEXT:    v_cndmask_b32_e64 v71, v84, v81, s17
+; GFX10-NEXT:    v_cndmask_b32_e32 v68, v80, v83, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v83, 0xffff0000, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v70, v84, v81, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v30, v85, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v30, v85, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v11
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v30, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v28
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, v12, v28, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v32
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v30, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v34
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v96, v28, v82, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v32, v31, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v34, v33, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v49
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v51
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v33
+; GFX10-NEXT:    v_cndmask_b32_e64 v31, v36, v35, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v55
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v49, v48, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v34
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, v39, v37, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v67
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v69
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v55, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, v51, v50, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v71
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v80
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v53, v52, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, v55, v54, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v65, v64, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v83, v83
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v83
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v15, v83, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v66, v67, s9
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v86, v50, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v50, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v83, v64, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v15, v38, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v39, v68, v51, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v38
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v67, v66, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v84, v37, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v38, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v37, v54, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e32 v48, v69, v70, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v53, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v66
-; GFX10-NEXT:    v_cndmask_b32_e32 v65, v55, v64, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v69, v68, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v50, v80, v71, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v65, v64, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v71, v70, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v52
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v38, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v64
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v39, v54, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v49, v53, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v65
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e32 v52, v82, v81, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v38, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v39, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v97
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v80, v81, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v65, v54, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v85
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v87, v85, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v53, v85, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v55, v87, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v53, v28, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v65
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v54, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v27
+; GFX10-NEXT:    v_perm_b32 v14, v14, v38, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v49, v52, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v82
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v96
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX10-NEXT:    v_perm_b32 v13, v14, v13, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v11
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v9
-; GFX10-NEXT:    v_perm_b32 v14, v31, v28, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v12, v32, v12, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v86
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v87, v86, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v52, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v96, v82, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v97, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v26
+; GFX10-NEXT:    v_perm_b32 v13, v13, v27, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v82
+; GFX10-NEXT:    v_cndmask_b32_e32 v64, v96, v82, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v52, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v26, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v54
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v51, v64, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v25
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v52, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v64, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v53, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v29
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v54, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v24
+; GFX10-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
-; GFX10-NEXT:    v_perm_b32 v11, v33, v11, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v29, v26, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v51, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v23
 ; GFX10-NEXT:    v_perm_b32 v10, v30, v10, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
 ; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_perm_b32 v9, v34, v9, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v53, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v22
+; GFX10-NEXT:    v_perm_b32 v9, v31, v9, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_perm_b32 v8, v35, v8, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_perm_b32 v7, v36, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX10-NEXT:    v_perm_b32 v8, v28, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v52, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v50
+; GFX10-NEXT:    v_perm_b32 v7, v32, v7, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v3
+; GFX10-NEXT:    v_perm_b32 v6, v33, v6, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX10-NEXT:    v_perm_b32 v5, v38, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX10-NEXT:    v_perm_b32 v5, v34, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX10-NEXT:    v_perm_b32 v3, v39, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX10-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v16, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v37, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v20, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_perm_b32 v2, v48, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v15, v49, v51, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v48, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_perm_b32 v2, v15, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v15, v49, v39, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v35, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximumnum_v32bf16:
@@ -11685,1268 +10499,1139 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v35, v35
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v37, v37
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v16
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v55, v55
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.h, v32.l, s2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v28
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v35, v35
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v37, v37
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v49, v49
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v53, v53
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v55.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v16
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v36, v36
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v38, v38
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v48, v48
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v50, v50
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v64, v64
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v68, v68
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v55, v55
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v13.h, v29.h, s3
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v12.h, v28.h, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v11.h, v27.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v8.h, v24.h, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v7.h, v23.h, s15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v6.h, v22.h, s17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v5.h, v21.h, s19
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v118
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.h, v32.l, s2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v22
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v15
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v14
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v48, v48
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v64, v64
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v11.h, v27.h, s7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.l, v29.h, v33.l, s4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v81.l, v20.h, v50.l, s22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v54.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s40, v86, v118
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v85.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v87, v87
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v96, v96
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v32.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v33.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v34.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v50.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v51.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v67.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v68.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v69.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v70.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v71.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v80.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v81.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v82.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v116
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v128
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v49, v49
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v53, v53
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v97, v97
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v32.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v53.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v33.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v55.l, v32.l, s1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v96
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v98
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v112
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v113
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v83.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v147.l, v84.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v119
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v128
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v129
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v130
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v131
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v132
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v133
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v134
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v135
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v144
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v145
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s63, v116, v86
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v129
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v86, v128
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v130
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v28
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v50, v50
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v68, v68
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v10.h, v26.h, s9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v8.h, v24.h, s13
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v35.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v64.l, v33.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v65.l, v34.l, s3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v87, v129
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v131
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v96, v128
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s44, v99, v99
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s45, v100, v100
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v38.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v66.l, v35.l, s4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v68.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v97, v129
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v65.l, v34.l, s3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s43, v98, v98
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v99
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v114
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v146
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v147
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s42, v87, v118
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s43, v96, v119
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s45, v98, v129
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s56, v101, v132
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s59, v112, v135
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s60, v113, v144
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v86.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v34.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v133
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v64.l, v33.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v66.l, v35.l, s4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v7.h, v23.h, s15
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v6.h, v22.h, s17
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v36.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v38.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v98
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v68.l, v37.l, s6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v132
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v99, v129
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v55.l, v32.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v33.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s46, v101, v101
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v5.h, v21.h, s19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v48.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v67.l, v36.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v69.l, v38.l, s7
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v98, v128
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s7, v86, v100
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v68.l, v37.l, s6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v66
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v4.h, v20.h, s21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v101
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v103
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v71.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v67.l, v36.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v69.l, v38.l, s7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v64
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v65
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v2.h, v18.h, s25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v39.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v48.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v81.l, v20.h, v50.l, s22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v50.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v49.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.l, v19.h, v51.l, s24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v51.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v83.l, v18.h, v52.l, s26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v53.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v70.l, v39.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v71.l, v48.l, s9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v113
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v81.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s8, v87, v101
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v84.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v112
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s9, v96, v103
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v67
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v69
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v52.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v80.l, v49.l, s10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v114
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v82.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v116
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v85.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s10, v97, v112
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v113
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v71.l, v48.l, s9
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v68
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s43
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v51.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s46, v99, v130
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s61, v114, v145
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s62, v115, v146
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v118
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v33.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v37.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v102.l, v48.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v52.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v53.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0, v70.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0, v81.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v101.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v112.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v113.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v55
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v38.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v49.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v54.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v102
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s44, v97, v128
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v114.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v115.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v39
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v35.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v81.l, v50.l, s11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v115
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v84.l, v53.l, s14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v117
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v114
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s57, v102, v133
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v116.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v51
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v48.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0, v65.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0, v66.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s47, v100, v131
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s58, v103, v134
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v96.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v97.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v52
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v38.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v49.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v64.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0, v71.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v87.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v102.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v128
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v129
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0, v80.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0, v85.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v103.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v119
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v48
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v65
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v66
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v86.l, v13.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v49
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v64
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v71
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0, v67.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0, v68.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v98.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v99.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v130
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v131
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0, v84.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v70
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v67
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0, v82.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v68
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0, v83.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v80
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0, v69.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v100.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v81
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v132
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v69
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s11, v98, v113
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s14, v87, v101
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v80.l, v49.l, s10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.l, v48.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v11.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s44
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.h, v82.l, v51.l, s12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.h, v83.l, v52.l, s13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.h, v85.l, v54.l, s15
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s12, v99, v114
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s13, v86, v100
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s15, v96, v103
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v70.l, v39.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v81.l, v50.l, s11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v84.l, v53.l, s14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v80
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v12.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v29.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v82.l, v51.l, s12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v83.l, v52.l, s13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v85.l, v54.l, s15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.l, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v81
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v80
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v36.l, v30.h, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v119
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.l, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v71
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v81
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v29.h, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s46
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v118
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s17, v116, v115
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v70
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v82
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v83
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v84
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v71
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v85
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v28.h, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v37.l, v32.h, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v49.l, v37.h, s10
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s16, v112, v97
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v70
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v82
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v83
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v84
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v39.l, v35.h, s8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v28.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.l, v14.l, s16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v38.l, v33.h, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v50.l, v38.h, s11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v86
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v54.l, v50.h, s15
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v31
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v31
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v50.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v15.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v51.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v54, v54
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v50.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v52, v53
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v52
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v32.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v50.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.l, v15.h, v31.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v66, v66
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v64.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v38, v53
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v31.l, v15.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.l, v31.h, v64.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.h, v48.l, v36.h, s9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v12.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v14.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v29
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v33.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v117, v117
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v31.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v67, v69
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v66, v68
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v65.l, v64.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v15.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v64.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v32.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v31.l, v15.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v65.l, v64.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v29.l, v13.l, s17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v51.l, v39.h, s12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v31.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v37
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v53.l, v49.h, s14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v52.l, v48.h, s13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v102, v102
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v13.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v29.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v30.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v52, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v29.l, v13.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v28.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v30.l, v14.l, s0
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v30.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v51, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v29.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v29.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v11.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v31.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v28.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v26.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v51, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v27, v26
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v48, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v29.l, v13.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v28.l, v12.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v55.l, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v49, v48
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.l, v26.l, v10.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v27.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v31.l, v10.h, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v26.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v23.l, v7.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v28.l, v12.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v49, v48
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.l, v25.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.l, v11.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.l, v26.l, v10.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v29.l, v9.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v30.l, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v49, v48
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.l, v24.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v25.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v24.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v28.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v26.l, v10.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v49, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v23.l, v7.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v25.l, v9.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v24.l, v8.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v26
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v25, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v27.l, v7.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v22.l, v6.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v10.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v24.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v21.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v10.l, v7.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v27, v26
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v8.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v20
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v12, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v6.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v9.l, v5.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v23.l, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v20.l, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v8.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v6.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v8.l, v5.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v7.l, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v23, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v7.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v18.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v12, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v17.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v16.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.l, v4.l, s3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v17, v16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v9.l, v2.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v6.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v7.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v18
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v16.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v17.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v8.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v10.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v20
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v10.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v1.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v2.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v19.l, v0.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v18.l, v0.h, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v17.l, v1.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v16.l, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v5.l, v2.l, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v49 :: v_dual_mov_b32 v2, v48
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v39 :: v_dual_mov_b32 v4, v38
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v36 :: v_dual_mov_b32 v6, v35
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v7, v34 :: v_dual_mov_b32 v8, v33
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v32 :: v_dual_mov_b32 v10, v31
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v30 :: v_dual_mov_b32 v12, v37
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v38 :: v_dual_mov_b32 v1, v37
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, v33 :: v_dual_mov_b32 v5, v32
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v31 :: v_dual_mov_b32 v7, v30
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v8, v29 :: v_dual_mov_b32 v9, v28
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v10, v34
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v32bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v116, v115, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v128, v119, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v132, v131, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v144, v135, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v39
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v80
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v55, v64, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v51
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v54
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v96
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v114
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v116
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v69, v71, v80, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v71, v83, v84, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v83, v87, v96 :: v_dual_and_b32 v98, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v118
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v99, v100, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v97, v103, v112, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v112
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v101, v115, v114, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v116
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v119, v116, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v118, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v129, v135, v128, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v118
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v147, v34, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v49, v130
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v37, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v131
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v132
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v69, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v81, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v85, v135
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v97, v144
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v101, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v112, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v114, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v116, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v118, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v128, v49
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v30, v14 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v68
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v84
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v35, v36, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v133, v39, v48, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v135, v51, v52 :: v_dual_lshlrev_b32 v50, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v145, v54, v64, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v70
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v80
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v82
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v84
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v96
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v98
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v35
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v102
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v39
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v147, v65, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v37, v132
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v132, v69, v80, s0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v49, v134
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v65
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v134, v71, v84, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e64 vcc_lo, 0, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v114
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v53, v144
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v144, v83, v96, s2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v55, v146
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v129, v128, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v34
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v67
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v71
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v83
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v87
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v99
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v103
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v113
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v115
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v119
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v146, v86, v100, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v80
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v67, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v54, v64, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v38, v34 :: v_dual_lshlrev_b32 v53, 16, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v86
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v97, v112, s4
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s4, v70, v49
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v101, v114, s5
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s5, v81, v53
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v65, v68, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v100
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v97
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v71, v84, s5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v112
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v84, v30, v14 :: v_dual_lshlrev_b32 v85, 16, v96
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v87, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v113, v116, s6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v86, v100 :: v_dual_lshlrev_b32 v36, 16, v117
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v99, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e64 s6, v85, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v113
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v97, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v103, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v83, v96, s6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v101, v114, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v115, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v69, v80, s4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v113, v116 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v119, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v69
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v117, v118 :: v_dual_lshlrev_b32 v87, 16, v71
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v130, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v53
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v52, v129, v128 :: v_dual_lshlrev_b32 v101, 16, v55
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v131, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v36
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v117, v118, s7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v80
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v82, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v83
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v35, v35, v98 :: v_dual_lshlrev_b32 v68, 16, v51
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v39, v133, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v135, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v80
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v54, v145, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v82
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v65, v147, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v86
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v69, v132, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v71, v134 :: v_dual_and_b32 v69, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v96
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v83, v144, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v97
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v146, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v99
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v53, v70, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v55, v81, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v67
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v33
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v64, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v103
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v36, v85, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v67, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v112
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v36
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v52, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v31
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v113
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v67, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v36, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v32
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v36, v32 :: v_dual_lshlrev_b32 v31, 16, v50
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v84, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v31
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v50, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v52, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v30
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v31, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v29, v28, v12 :: v_dual_lshlrev_b32 v50, 16, v11
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v30, v13 :: v_dual_lshlrev_b32 v32, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v36, v30
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v10 :: v_dual_lshlrev_b32 v30, 16, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v51, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v29, 16, v25
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v32, v29
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v9 :: v_dual_lshlrev_b32 v29, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v30, v28
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v25, v24, v8 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v65, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v27, 16, v23
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v25, v8 :: v_dual_lshlrev_b32 v27, 16, v22
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v66, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v27
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v22, v6 :: v_dual_lshlrev_b32 v27, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v68, v7, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v20 :: v_dual_lshlrev_b32 v25, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v22
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v21, v5 :: v_dual_lshlrev_b32 v25, 16, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v4 :: v_dual_lshlrev_b32 v23, 16, v26
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v22
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v18, v2 :: v_dual_lshlrev_b32 v25, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v55, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v16, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_lshlrev_b32 v23, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v48, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v20, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v53, v4, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v32bf16:
@@ -12960,747 +11645,660 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v30
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v28
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v35, v35
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v37, v37
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v23
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v16
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v36, v36
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v38, v38
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v55, v55
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v13.h, v29.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v12.h, v28.h, s5
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.h, v32.l, s2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v28
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v15
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v14
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v48, v48
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v64, v64
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v11.h, v27.h, s7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.l, v29.h, v33.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.l, v28.h, v34.l, s6
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v55.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v30
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v87, v87
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v96, v96
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v32.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v33.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v34.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v64.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v65.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v128
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v23
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v22
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v21
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v35, v35
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v37, v37
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v49, v49
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v53, v53
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v97, v97
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v55.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v36, v36
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v38, v38
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v48, v48
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v33.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v34.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v55.l, v32.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v96
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v66.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v129
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v86, v128
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v130
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v29
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v28
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v50, v50
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v64, v64
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v68, v68
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v13.h, v29.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v12.h, v28.h, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v11.h, v27.h, s7
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v8.h, v24.h, s13
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v35.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v64.l, v33.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v65.l, v34.l, s3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v87, v129
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v131
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v96, v128
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s44, v99, v99
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s45, v100, v100
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v38.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v66.l, v35.l, s4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v68.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s4, v97, v129
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v65.l, v34.l, s3
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s43, v98, v98
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v37.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v67.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v99
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v69.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v133
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v64.l, v33.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v66.l, v35.l, s4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v34.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v7.h, v23.h, s15
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v6.h, v22.h, s17
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v36.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v38.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v98
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v68.l, v37.l, s6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v132
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s6, v99, v129
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v55.l, v32.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v33.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s46, v101, v101
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v5.h, v21.h, s19
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v48.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v67.l, v36.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v69.l, v38.l, s7
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s5, v98, v128
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s7, v86, v100
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v68.l, v37.l, s6
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v32.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v66
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v118
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v15
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v14
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v49.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v101
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v70.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v103
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v71.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v67.l, v36.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v69.l, v38.l, s7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v35.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v37.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v64
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v65
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.l, v29.h, v33.l, s4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v39.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v48.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v81.l, v20.h, v50.l, s22
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v50.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v80.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v36.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v49.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v54.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s40, v86, v118
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v85.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v30
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v87, v87
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v96, v96
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v33.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v34.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v50.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v51.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v51.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v64.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v65.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v66.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v67.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v68.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v69.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v70.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v71.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v80.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v81.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v82.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v116
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v97, v97
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v53.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v32.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v52.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v53.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v96
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v98
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v70.l, v39.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v71.l, v48.l, s9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v113
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v81.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s8, v87, v101
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v84.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v112
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s9, v96, v103
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v67
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v69
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v50.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v52.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v53.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v80.l, v49.l, s10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v114
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v82.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v83.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v116
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v85.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s10, v97, v112
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v113
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v83.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v147.l, v84.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v119
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v128
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v129
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v130
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v131
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v132
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v133
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v134
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v135
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v144
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v145
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s63, v116, v86
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0, v55.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v99
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v114
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v146
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v147
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s42, v87, v118
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s43, v96, v119
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s45, v98, v129
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s56, v101, v132
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s59, v112, v135
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s60, v113, v144
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v86.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v34.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0, v39.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0, v50.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v71.l, v48.l, s9
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v68
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s43
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0, v51.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s46, v99, v130
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s61, v114, v145
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s62, v115, v146
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v118
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v33.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v37.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v102.l, v48.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0, v52.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0, v53.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0, v70.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0, v81.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v101.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v112.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v113.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v55
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v38.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v49.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0, v54.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v102
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s44, v97, v128
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v114.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v115.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v39
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v35.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v81.l, v50.l, s11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v115
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v84.l, v53.l, s14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v117
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v114
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s57, v102, v133
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v116.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v51
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0, v48.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0, v65.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0, v66.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s47, v100, v131
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s58, v103, v134
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v96.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v97.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v52
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v38.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0, v49.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0, v64.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0, v71.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v87.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v102.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v128
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v129
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0, v80.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0, v85.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v103.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v119
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v48
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v65
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v66
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v86.l, v13.h, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v49
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v64
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v71
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0, v67.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0, v68.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v98.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v99.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v130
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v131
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0, v84.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v70
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v67
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0, v82.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v68
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0, v83.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v80
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0, v69.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v100.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v81
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v132
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v69
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s11, v98, v113
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s14, v87, v101
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v80.l, v49.l, s10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v80.l, v48.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v11.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s44
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.h, v82.l, v51.l, s12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.h, v83.l, v52.l, s13
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.h, v85.l, v54.l, s15
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s12, v99, v114
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s13, v86, v100
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s15, v96, v103
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v70.l, v39.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v81.l, v50.l, s11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v84.l, v53.l, s14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v49.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v80
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v12.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v13.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v29.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v82.l, v51.l, s12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v83.l, v52.l, s13
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v85.l, v54.l, s15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v39.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v85.l, v53.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v81
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v80
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v36.l, v30.h, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s45
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v14.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v30.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v119
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v70.l, v38.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v50.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v51.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v84.l, v52.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v71
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v81
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v29.h, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s46
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v118
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s17, v116, v115
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v70
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v82
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v83
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v84
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v71
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v85
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v28.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v37.l, v32.h, s6
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v49.l, v37.h, s10
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s16, v112, v97
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v70
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v82
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v83
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v84
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v39.l, v35.h, s8
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v28.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.l, v14.l, s16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v38.l, v33.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v50.l, v38.h, s11
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v55.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v86
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v54.l, v50.h, s15
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v31
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v31
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v50.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v15.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v51.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v54, v54
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.l, v15.h, v31.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v66, v66
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v15.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v64.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v50.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v52, v53
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v15.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v52
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v51.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v32.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v50.l, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v38, v53
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v14.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v29
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v50
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v33.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v117, v117
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v31.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v13.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v29.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v28
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.l, v31.h, v64.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.h, v48.l, v36.h, s9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v31.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v65.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v67, v69
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v66, v68
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v30.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v52, v51
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v65.l, v64.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v15.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v64.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v33.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v32.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v31.l, v15.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v65.l, v64.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v29.l, v13.l, s17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v51.l, v39.h, s12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v31.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v37
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v53.l, v49.h, s14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v27
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v52.l, v48.h, s13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v102, v102
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v14.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v29.l, v13.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v28.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v11
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v30.l, v14.l, s0
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v13.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v30.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v51, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v29.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v29.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v27.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v11.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v28.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v31.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v51, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v28.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v10.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v26.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v51, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v25.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v27, v26
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v48, v39
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v49
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v29.l, v13.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v28.l, v12.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v27.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v26
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v29.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v55.l, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v49, v48
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.l, v26.l, v10.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v27.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v31.l, v10.h, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v26.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v28.l, v12.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v49, v48
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.l, v25.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.l, v11.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.l, v26.l, v10.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v25.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v9.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v27.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v29.l, v9.h, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v30.l, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v49, v48
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.l, v24.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v8.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v25.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v24.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v28.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v9.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v12.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v8.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v11.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v26.l, v10.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v49, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v23.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v23.l, v7.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v24
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v25.l, v9.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v24.l, v8.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v11.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v9.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s1
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v39
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v26
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v25, v24
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v27.l, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v22.l, v6.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v12, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v10.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v22
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v24.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v21.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v39
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v10.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v27, v26
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v6.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v9.l, v5.h, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v23.l, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v20.l, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v8.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v8.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v5.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v20
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v12, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v6.l, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v8.l, v5.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v7.l, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v23, v21
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v18
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v7.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v11, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v18.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v12, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v17.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v20
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v16.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.l, v4.l, s3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v10.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v17, v16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v2.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v6.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v7.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v20, v18
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v22, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v17.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v8.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v9.l, v2.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v10.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v18.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v20
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v10.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v1.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v2.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v19.l, v0.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v18.l, v0.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v17.l, v1.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v16.l, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v5.l, v2.l, s1
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v29 :: v_dual_mov_b32 v1, v49
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v48 :: v_dual_mov_b32 v3, v39
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v38 :: v_dual_mov_b32 v1, v37
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v35
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v38 :: v_dual_mov_b32 v5, v36
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, v35 :: v_dual_mov_b32 v7, v34
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, v33 :: v_dual_mov_b32 v9, v32
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v10, v31 :: v_dual_mov_b32 v11, v30
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v33 :: v_dual_mov_b32 v5, v32
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, v31 :: v_dual_mov_b32 v7, v30
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, v29 :: v_dual_mov_b32 v9, v28
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v10, v34
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v32bf16:
@@ -13710,792 +12308,683 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v116, v115, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v128, v119, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v132, v131, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v144, v135, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v39
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v80
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v55, v64, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v51
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v84
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v54
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v96
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s5, 0, v114
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s6, 0, v116
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v69, v71, v80, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v71, v83, v84, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v83, v87, v96 :: v_dual_and_b32 v98, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s7, 0, v118
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v99, v100, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s3, 0, v100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v97, v103, v112, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 s4, 0, v112
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v101, v115, v114, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v116
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v119, v116, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v118, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v129, v135, v128, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v118
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v147, v34, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v49, v130
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v37, v129
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v53, v131
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v132
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v69, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v81, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v85, v135
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v97, v144
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v101, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v112, v146
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v114, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v116, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v118, v30
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v30, v14 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v128, v49
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v84
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v36
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v35, v36, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v68
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v70
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v82
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v86
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v96
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v98
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v35
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v102
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v39
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v34
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v51
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v67
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v71
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v83
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v99
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v113
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v119
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v38
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v133, v39, v48, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v135, v51, v52 :: v_dual_lshlrev_b32 v50, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v64
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v145, v54, v64, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v147, v65, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v37, v132
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v132, v69, v80, s0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v49, v134
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v65
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v134, v71, v84, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e64 vcc_lo, 0, v128
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v114
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v53, v144
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v144, v83, v96, s2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s2, v55, v146
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v129, v128, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v34
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v69
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v146, v86, v100, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v80
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s3, v67, v37
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v54, v64, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v38, v34 :: v_dual_lshlrev_b32 v53, 16, v71
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v86
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v97, v112, s4
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s4, v70, v49
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v101, v114, s5
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s5, v81, v53
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v65, v68, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v100
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v97
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v71, v84, s5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v112
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v84, v30, v14 :: v_dual_lshlrev_b32 v85, 16, v96
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v87, v37
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v101
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v113, v116, s6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v86, v100 :: v_dual_lshlrev_b32 v36, 16, v117
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v99, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e64 s6, v85, v55
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v113
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v97, v112, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v103, v53
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v128
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v38
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v83, v96, s6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v101, v114, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v115, v55
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v69, v80, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v113, v116 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v119, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v69
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v117, v118 :: v_dual_lshlrev_b32 v87, 16, v71
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v130, v52
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v37
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v53
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v52, v129, v128 :: v_dual_lshlrev_b32 v101, 16, v55
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v131, v68
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v36
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v117, v118, s7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v52
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v80
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v54
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v34
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v82, v96
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v65
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v83
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v35, v35, v98 :: v_dual_lshlrev_b32 v68, 16, v51
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v66
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v39, v133, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v135, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v80
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v54, v145, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v82
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v65, v147, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v86
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v64
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v69, v132, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v67
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v71, v134 :: v_dual_and_b32 v69, 0xffff0000, v31
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v96
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v83, v144, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v97
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v33
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v146, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v99
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v67, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v52
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v53, v70, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v101
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v53
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v55, v81, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v67, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v103
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v36, v85, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v64, v53
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v67, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v112
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v36
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v52, v48, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v31
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v113
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v64, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v67, v69
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v66
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v36, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v65, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v32
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v36, v32 :: v_dual_lshlrev_b32 v31, 16, v50
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v84, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v31
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
+; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v66, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v50, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v52, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v30
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v31, v15, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v29, v28, v12 :: v_dual_lshlrev_b32 v50, 16, v11
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v30, v13 :: v_dual_lshlrev_b32 v32, 16, v27
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v38, v13, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v54, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v50, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v36, v30
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v10 :: v_dual_lshlrev_b32 v30, 16, v8
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v51, v11, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v55, v54
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v29, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v32, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v9 :: v_dual_lshlrev_b32 v29, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
+; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v54, v10, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v30, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v25, v24, v8 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v65, v9, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v27, 16, v23
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v26
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v29, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v25, v8 :: v_dual_lshlrev_b32 v27, 16, v22
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v66, v8, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v28, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v22, v6 :: v_dual_lshlrev_b32 v27, 16, v20
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v68, v7, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v22
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v20 :: v_dual_lshlrev_b32 v25, 16, v5
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v21, v5 :: v_dual_lshlrev_b32 v25, 16, v19
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v4 :: v_dual_lshlrev_b32 v23, 16, v26
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v23, v19
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v24, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v18, v2 :: v_dual_lshlrev_b32 v25, 16, v16
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v26, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v25, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v55, v3, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v16, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v18
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_lshlrev_b32 v23, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v48, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v20, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v33, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v53, v4, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <32 x bfloat> @llvm.maximumnum.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y)
   ret <32 x bfloat> %result
@@ -14526,12 +13015,10 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14540,21 +13027,19 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14562,27 +13047,24 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14600,9 +13082,7 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -14628,15 +13108,12 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14658,10 +13135,8 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -14694,18 +13169,14 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14732,14 +13203,11 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@@ -14772,22 +13240,20 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14797,12 +13263,10 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -14814,22 +13278,20 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14839,12 +13301,10 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
@@ -14856,27 +13316,24 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
@@ -14889,14 +13346,11 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
@@ -14925,18 +13379,14 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
@@ -14968,14 +13418,13 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v1.l, v0.l, s0
@@ -14984,20 +13433,16 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15011,39 +13456,36 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15079,11 +13521,10 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v5, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
@@ -15096,17 +13537,14 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -15128,46 +13566,41 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -15207,22 +13640,20 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15232,12 +13663,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15249,12 +13678,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -15266,22 +13693,20 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15291,12 +13716,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15308,12 +13731,10 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
@@ -15325,27 +13746,24 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -15358,14 +13776,11 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
@@ -15381,14 +13796,11 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
@@ -15425,29 +13837,23 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -15457,65 +13863,65 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v9, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -15524,59 +13930,53 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -15588,76 +13988,70 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v8, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v7, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -15670,75 +14064,67 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -15780,41 +14166,37 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15824,12 +14206,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15841,12 +14221,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -15860,41 +14238,37 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15904,12 +14278,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15921,12 +14293,10 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
@@ -15939,51 +14309,46 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -15996,14 +14361,11 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -16019,14 +14381,11 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
@@ -16055,57 +14414,49 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v14
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -16116,84 +14467,76 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v11
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v9, v14
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v13, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
@@ -16219,63 +14562,54 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16289,98 +14623,86 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v4.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v10, v11
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v9, v14
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e64 s2, v13, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
@@ -16413,78 +14735,69 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v10
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
index b97239081ac77..b94270f64eea4 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -33,14 +33,11 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -49,22 +46,19 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimumnum_bf16:
@@ -72,27 +66,24 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_bf16:
@@ -108,11 +99,9 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -137,16 +126,13 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -167,11 +153,9 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -203,19 +187,15 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -243,13 +223,10 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@@ -276,53 +253,42 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimumnum_bf16_nnan:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimumnum_bf16_nnan:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_bf16_nnan:
@@ -332,11 +298,9 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -349,15 +313,13 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -369,13 +331,10 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -392,18 +351,15 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -420,16 +376,12 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
@@ -461,23 +413,20 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -487,12 +436,10 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -504,23 +451,20 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -530,12 +474,10 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
@@ -547,27 +489,24 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
@@ -580,15 +519,11 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
@@ -617,18 +552,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
@@ -660,36 +591,31 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -703,39 +629,36 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -771,34 +694,30 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -820,46 +739,41 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -893,28 +807,23 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v3, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -924,28 +833,23 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -956,36 +860,29 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -994,28 +891,24 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1026,27 +919,23 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
@@ -1058,32 +947,26 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v1, v0 :: v_dual_and_b32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1099,29 +982,26 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.h
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v1.h, s2
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
@@ -1137,39 +1017,30 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v1, v0 :: v_dual_and_b32 v5, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v4 :: v_dual_lshlrev_b32 v5, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1207,23 +1078,20 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1233,12 +1101,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -1250,12 +1116,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1267,23 +1131,20 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1293,12 +1154,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -1310,12 +1169,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
@@ -1327,27 +1184,24 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -1360,14 +1214,11 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
@@ -1383,15 +1234,11 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
@@ -1428,29 +1275,23 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1460,65 +1301,65 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v9, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16:
@@ -1527,59 +1368,53 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16:
@@ -1591,76 +1426,70 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16:
@@ -1673,75 +1502,67 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -1777,38 +1598,31 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1820,39 +1634,32 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1863,17 +1670,14 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
@@ -1881,73 +1685,60 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3bf16_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1956,86 +1747,77 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v9, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v9, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v0.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v10, v7 :: v_dual_lshlrev_b32 v2, 16, v6
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v6, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16_nnan:
@@ -2046,49 +1828,44 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v9, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v9, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.h
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v6.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_nnan:
@@ -2098,57 +1875,45 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v2, v0 :: v_dual_lshlrev_b32 v4, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v10, v7 :: v_dual_lshlrev_b32 v2, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v6, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -2190,42 +1955,37 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2235,12 +1995,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2252,12 +2010,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2271,42 +2027,37 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2316,12 +2067,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -2333,12 +2082,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
@@ -2351,51 +2098,46 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -2408,20 +2150,17 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
@@ -2431,15 +2170,11 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
@@ -2468,57 +2203,49 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v14
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2529,84 +2256,76 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v11
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v9, v14
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v13, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16:
@@ -2632,63 +2351,54 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2702,98 +2412,86 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v11
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v9, v14
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v13, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16:
@@ -2826,78 +2524,69 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
@@ -2939,54 +2628,45 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX8-LABEL: v_minimumnum_v4bf16_nnan:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3000,52 +2680,43 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v4, s4
@@ -3057,67 +2728,54 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_perm_b32 v1, v1, v4, s0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
-; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX950-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v1, v4, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3126,54 +2784,46 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v4bf16_nnan:
@@ -3182,109 +2832,93 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v7, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v6, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v8, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v5, v6
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v10, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v3.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.l, v1.h, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v7, v5 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_and_b32 v9, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v2, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v13, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v11 :: v_dual_lshlrev_b32 v2, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v4, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v4bf16_nnan:
@@ -3297,61 +2931,50 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v5, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v4
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v7, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v8, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v6, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.h
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v5, v6
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v10, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v1.h
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v3.h
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, s0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.h
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.l, v1.h, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
@@ -3361,74 +2984,61 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v7, v5 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_and_b32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v4, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v2, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v13, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v11 :: v_dual_lshlrev_b32 v2, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v1, v4, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
   ret <4 x bfloat> %result
@@ -3483,61 +3093,54 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3547,12 +3150,10 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -3564,12 +3165,10 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
@@ -3581,12 +3180,10 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3603,61 +3200,54 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v7
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
 ; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
@@ -3667,12 +3257,10 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -3684,12 +3272,10 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
@@ -3701,12 +3287,10 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v8, v0, s4
@@ -3721,76 +3305,70 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
+; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v5
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
-; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v10
-; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
 ; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX950-NEXT:    s_nop 0
@@ -3803,20 +3381,17 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v2, v6, v2, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
@@ -3826,20 +3401,17 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v1, v7, v1, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
@@ -3849,17 +3421,12 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v7, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
-; GFX950-NEXT:    v_perm_b32 v2, v6, v2, s0
-; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v8, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
@@ -3874,108 +3441,96 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_sdwa v12, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v15, v14, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v10, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v14, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v8, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v10, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v14, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v13
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v13, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v8, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX10-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX10-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v2, v7, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3990,119 +3545,106 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v10, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.h, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v13, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v14, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v14, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v14, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v12.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v9.l, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v9.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v7
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v10, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v14, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v15, v14
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v12.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v14
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v2.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v1.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.l, v1.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.l, v2.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v6bf16:
@@ -4112,115 +3654,104 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v8, v7 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v7, v6 :: v_dual_lshlrev_b32 v13, 16, v7
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v11 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v8 :: v_dual_lshlrev_b32 v8, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v13, v7 :: v_dual_lshlrev_b32 v14, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v12, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v8, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v13, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v4, v1 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_lshlrev_b32 v10, 16, v11
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4239,136 +3770,124 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v7, v7
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v9, v9
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v11, v11
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v10, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.h, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v13, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v13
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v13, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v14, v14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v3.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v13.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v14, v16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v7.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v14, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v3.h, v10.l, s2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v10.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.l, s3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v12.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v9.l, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v9.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v11.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v7
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v7
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v10, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v14, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v4.l, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.l, v10.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v15, v14
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v9.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v11.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v12.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v14
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v10
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v12
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v0.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v1.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v2.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v1.h, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v7.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v9.l, v1.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v8.l, v2.l, s0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v6bf16:
@@ -4382,146 +3901,131 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v8, v7 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v10, v9 :: v_dual_and_b32 v11, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v7, v6 :: v_dual_lshlrev_b32 v13, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v13
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v12, v11 :: v_dual_lshlrev_b32 v12, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v12 :: v_dual_lshlrev_b32 v14, 16, v9
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v14
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v8 :: v_dual_lshlrev_b32 v8, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v10, v6 :: v_dual_lshlrev_b32 v13, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v8 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v13, v7 :: v_dual_lshlrev_b32 v14, 16, v9
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v7, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v10, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v12, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v8, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v13, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v11, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v0 :: v_dual_lshlrev_b32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v5, v2 :: v_dual_lshlrev_b32 v10, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v4, v1 :: v_dual_lshlrev_b32 v11, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v9, v2 :: v_dual_lshlrev_b32 v13, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v11
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v4, v1 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v3, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v4, v1 :: v_dual_lshlrev_b32 v10, 16, v11
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v4, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v3 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v8, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x bfloat> @llvm.minimumnum.v6bf16(<6 x bfloat> %x, <6 x bfloat> %y)
@@ -4589,80 +4093,71 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4672,12 +4167,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -4689,12 +4182,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
@@ -4706,12 +4197,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -4723,12 +4212,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -4747,80 +4234,71 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX900-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v9
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX900-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX900-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v11
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
@@ -4830,12 +4308,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
@@ -4847,12 +4323,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
@@ -4864,12 +4338,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
@@ -4881,12 +4353,10 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v11, v0, s4
@@ -4902,101 +4372,92 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
-; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v11
-; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
 ; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v12
-; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v9
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
+; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v13
-; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v14
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v11
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; GFX950-NEXT:    s_nop 0
@@ -5009,20 +4470,17 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v3, v8, v3, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
@@ -5032,20 +4490,17 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v7
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v2, v9, v2, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
@@ -5055,20 +4510,17 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v1, v10, v1, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
@@ -5078,17 +4530,12 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v4, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v10, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
-; GFX950-NEXT:    v_perm_b32 v2, v9, v2, s0
-; GFX950-NEXT:    v_perm_b32 v3, v8, v3, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v11, v0, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
@@ -5103,7 +4550,7 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
@@ -5115,134 +4562,118 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v9
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v17, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v10, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v16, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v15, v3, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5250,171 +4681,152 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v5.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v19, v17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v13.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v10.l, v8.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v15, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v13, v18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v14.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v15.l, v9.l, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v11, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v19, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v10.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v12.l, s2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v11.l, v9.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v3.h, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v13.l, v12.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.l, v8.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v9.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v10.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v8.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v15, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v7.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v6.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v14, v13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v5.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v4.l, v0.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v3.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v1.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v8
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v2.l, s5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.l, v1.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.l, v1.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.l, v0.l, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v9.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v7
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v8bf16:
@@ -5424,7 +4836,7 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -5433,139 +4845,131 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v12, v11 :: v_dual_lshlrev_b32 v13, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v9, v8 :: v_dual_and_b32 v15, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v11, v10 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v15
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v13, v11 :: v_dual_lshlrev_b32 v17, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v11 :: v_dual_lshlrev_b32 v14, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v7, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v5, v1 :: v_dual_lshlrev_b32 v13, 16, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v4, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v15, v3, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5577,201 +4981,178 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v7
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.h, v6.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v7.h, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v10.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v11.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v8.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v15, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v5.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v16, v14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v12.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v19, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v13.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v13, v18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v11.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, s3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v14.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v13.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v10.l, v8.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v15, v15
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v15.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v15.l, v9.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v11, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v11.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v14.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v9.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v19, v17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v10.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v0.h, v4.h, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v13.l, v12.l, s2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v20
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v11.l, v9.l, s3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.h, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v3.h, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v12.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v10.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v9.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v13.l, v12.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v17
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v15.l, v8.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v13, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.l, v9.h, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v10.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v8.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v15, v12
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v4.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v7.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v6.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v14, v13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v10.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v5.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v4.l, v0.l, s0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v12.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v11.l, v3.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v1.h, s3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v8
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v2.l, s5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v14
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v3.l, s1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v11.l, v1.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.l, v1.h, s2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v12.l, v0.l, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v9.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v7
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v8bf16:
@@ -5785,7 +5166,7 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -5796,183 +5177,166 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v8 :: v_dual_and_b32 v10, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v12, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v12, v11 :: v_dual_lshlrev_b32 v13, 16, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v9, v8 :: v_dual_and_b32 v15, 0xffff0000, v6
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v12, v8 :: v_dual_and_b32 v13, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v11, v10 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v15, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v15
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v11, v10 :: v_dual_and_b32 v15, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_lshlrev_b32 v13, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v13, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v9
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v9 :: v_dual_lshlrev_b32 v18, 16, v14
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v13, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v14, v10 :: v_dual_and_b32 v13, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_lshlrev_b32 v16, 16, v13
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v10
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v12
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v12 :: v_dual_lshlrev_b32 v16, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v15, v12 :: v_dual_lshlrev_b32 v12, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v13, v11 :: v_dual_lshlrev_b32 v17, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v7, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v6 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v12, v3 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v12, v11 :: v_dual_lshlrev_b32 v14, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v7 :: v_dual_lshlrev_b32 v14, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v13, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v14, v2 :: v_dual_lshlrev_b32 v15, 16, v1
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v7, 16, v5
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v5, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v16
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v7, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v4, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v5, v1 :: v_dual_lshlrev_b32 v13, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_lshlrev_b32 v5, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_lshlrev_b32 v16, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v9, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y)
@@ -6090,156 +5454,139 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v19
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v22, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6249,12 +5596,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v7
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
@@ -6266,12 +5611,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v6
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
@@ -6283,12 +5626,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v5
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
@@ -6300,12 +5641,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
@@ -6317,12 +5656,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -6334,13 +5671,11 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
@@ -6351,12 +5686,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -6368,12 +5701,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -6400,156 +5731,139 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
+; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff0000, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v19
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v17
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
 ; GFX900-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
 ; GFX900-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
-; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v18
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
 ; GFX900-NEXT:    v_cndmask_b32_e32 v19, v17, v18, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
-; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v19
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v20, v18, v19, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v4
-; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v20
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v19
 ; GFX900-NEXT:    v_cndmask_b32_e32 v21, v19, v20, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX900-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v21
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
 ; GFX900-NEXT:    v_cndmask_b32_e32 v22, v20, v21, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v20, v20, v22, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
-; GFX900-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v22
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v21, v22, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
-; GFX900-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
+; GFX900-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v23
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v22, v23, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
-; GFX900-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX900-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
 ; GFX900-NEXT:    v_cndmask_b32_e32 v25, v23, v24, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v27
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
@@ -6559,12 +5873,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v7
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
@@ -6576,12 +5888,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v15
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v6
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
@@ -6593,12 +5903,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v5
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
@@ -6610,12 +5918,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v4
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
@@ -6627,12 +5933,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v3
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
@@ -6644,12 +5948,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v2
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
@@ -6661,12 +5963,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v1
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
@@ -6678,12 +5978,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v23, v0, s4
@@ -6703,198 +6001,180 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
-; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
+; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v15
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v19
-; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v13
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v16
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v16
-; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
-; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v17
-; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
 ; GFX950-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX950-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX950-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
+; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v20
-; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v17
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v17
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX950-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v19, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v18
+; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v13
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v18
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v21
-; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v18
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v18
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v22
-; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v19
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v19
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v21, 16, v11
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v23
-; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v20
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v20, v20, v22, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v22, 16, v10
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v22, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v10
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v24
-; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v21
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v21
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v23, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX950-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v25
-; GFX950-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v22
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v22
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v23
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX950-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v24, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v8
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v26
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v23
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v24
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v23, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v23
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v27
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
 ; GFX950-NEXT:    s_nop 0
@@ -6907,20 +6187,17 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v15
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v7, v16, v7, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
@@ -6930,20 +6207,17 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v15
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v14
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v6, v17, v6, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
@@ -6953,20 +6227,17 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v14
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v13
+; GFX950-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v15
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v5, v18, v5, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
@@ -6976,20 +6247,17 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v13
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v12, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v12
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v14
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v4, v19, v4, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
@@ -6999,20 +6267,17 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v12
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v11, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
+; GFX950-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v13
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v3, v20, v3, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
@@ -7022,20 +6287,17 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v11
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v12
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v2, v21, v2, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
@@ -7045,20 +6307,17 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v10
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v11
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v1, v22, v1, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
@@ -7068,23 +6327,14 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v9
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v8, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v10
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v22, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
-; GFX950-NEXT:    v_perm_b32 v2, v21, v2, s0
-; GFX950-NEXT:    v_perm_b32 v3, v20, v3, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v23, v0, s0
-; GFX950-NEXT:    v_perm_b32 v4, v19, v4, s0
-; GFX950-NEXT:    v_perm_b32 v5, v18, v5, s0
-; GFX950-NEXT:    v_perm_b32 v6, v17, v6, s0
-; GFX950-NEXT:    v_perm_b32 v7, v16, v7, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v16bf16:
@@ -7098,13 +6348,10 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
@@ -7113,278 +6360,249 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
 ; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v5
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v23, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v20
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v19, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v18
 ; GFX10-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v20, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v26
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v18, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v24, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v22, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v20, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v25, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v21
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v23, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v22
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v23, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v25, v24, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v27, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v29
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v21, v24, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v26, v23, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v24, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v27, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v27
 ; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v23, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v30, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v29, v25, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v24, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v14
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v26, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v25, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
-; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v27, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v26
+; GFX10-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX10-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
 ; GFX10-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v8
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX10-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX10-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX10-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7392,340 +6610,309 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v18, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v15.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v17.h, v14.h, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v20, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v5.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v24
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v7.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v19.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v23.l, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v25, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v18.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v20.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v25
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v4.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v6.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v19.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v12.h, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v18.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v21.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v2.h, v10.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, s2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.h, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v10.h, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v19.l, v18.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v22.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v21.l, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v18.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v24, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v1.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.l, v6.l, v22.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v23.l, v7.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v9.h, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v21.l, v20.l, s2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v22.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v26, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v19.l, v22.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v20.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v21.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v5.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v19.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v7.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v27, v25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v7.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v23, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v0.h, v8.h, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v18.l, v20.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v7.l, v19.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.h, v24.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v21.l, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v21, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v7.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v23.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v23, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v16.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v19.l, s0
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v15.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v23, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v25, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v21.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v21.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v18.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v17.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v16.l
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v17.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v14.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v15.l, v16.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v14.l, v17.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v15.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v17.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v14.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v14.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v19.l, v6.l, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v18, v17
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v13.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v12.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v13.l, v5.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v17.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v23, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v16.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v17, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v10.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v12.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v22, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s3
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v13.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v15, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v17
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.h, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v5.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v22, v17
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v24, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v15.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v9.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v8.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v17
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v9.l, v1.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v2.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v11.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.l, v0.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v0.h, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v20.l, v15.l, v1.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v10.l, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v20
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v16 :: v_dual_mov_b32 v4, v19
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v18
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v16bf16:
@@ -7735,305 +6922,273 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v19
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v18, v16 :: v_dual_and_b32 v21, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v23, v22 :: v_dual_lshlrev_b32 v18, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v19, v20 :: v_dual_and_b32 v23, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v20 :: v_dual_lshlrev_b32 v26, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v18, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v24, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v23, v18 :: v_dual_and_b32 v21, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v22, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v20, 16, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v29
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v19 :: v_dual_and_b32 v24, 0xffff0000, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v23, v21 :: v_dual_lshlrev_b32 v28, 16, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v23, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v23 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v27 :: v_dual_and_b32 v25, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v24 :: v_dual_lshlrev_b32 v24, 16, v26
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v30, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v23, 16, v24
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v29, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v24, v22 :: v_dual_lshlrev_b32 v23, 16, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v27 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v27
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v14, v6 :: v_dual_lshlrev_b32 v15, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v13, v5 :: v_dual_lshlrev_b32 v25, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v12, v4 :: v_dual_lshlrev_b32 v15, 16, v14
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v14, 16, v10
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8045,405 +7200,361 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v18, v5
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v15
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v14
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v16
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v17
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v15
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v14
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v16
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.h, v15.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v17.h, v14.h, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v22
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v14.h, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v13
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v20, v21
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v5.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v19.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v23.l, v7.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v7.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v24, v25
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v13.h, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v24
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v19.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v7.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v18.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v23.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v25, v26
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v21.l, v6.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v18.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v20.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v25
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v20.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v22.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v26, v21
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v19.l, v22.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v4.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v20.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v21.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v23.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v22.l, v20.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v12.h, v18.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v19.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v23
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v18.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v3.h, v11.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v11.h, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v18.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v20.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v21.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v2.h, v10.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v21.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v19.l, v18.l, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.h, v7.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v10.h, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v19.l, v18.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v23.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v22.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v21.l, v20.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v9
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v18.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v24, v25
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v1.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v20.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.l, v6.l, v22.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.h, v23.l, v7.l, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v9.h, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v21.l, v20.l, s2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v24.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v22.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v27, v25
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v23, v25
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v0.h, v8.h, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v18.l, v20.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v7.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v7.l, v19.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.h, v24.l, v6.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v8.h, v21.l, s1
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v20.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v21, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v18.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v21.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v19.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v23.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v21.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v23, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v16.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v19.l, s0
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v19.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v15.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v23, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v15.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v25, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v17.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v14.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v16.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v6.l, v21.l, s0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v17.l, v14.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v21.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v18.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v19.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v17.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v21.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v15.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v14.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v17.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v13
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v7.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v17.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v24, v23
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v22
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v16.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v14.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v15.l, v16.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v14.l, v17.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v13.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v15.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v14.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v19.l, v6.l, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v13.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v17.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v16.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v13.l, v5.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v24
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v14.l, v17.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v18, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v23, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v15.l, v3.h, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v16.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v17, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v18, v18
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v10.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v22, v22
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v12.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v22, v17
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v12.l, v4.l, s3
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v15
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v13.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v15, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v13.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v3.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v17
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v5.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v22, v17
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v24, v23
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v15.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v9.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v8.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v14.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v17
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v17
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v9.l, v1.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v2.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v11.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v16.l, v0.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v0.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v20.l, v15.l, v1.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v10.l, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v13.l, v2.l, s1
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v20 :: v_dual_mov_b32 v3, v16
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v19 :: v_dual_mov_b32 v5, v18
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v16bf16:
@@ -8457,18 +7568,16 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc_lo
@@ -8480,375 +7589,323 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v18
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v21, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v17, 16, v18
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v21, v22
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v18, v16 :: v_dual_and_b32 v21, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v19, v20 :: v_dual_and_b32 v18, 0xffff0000, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v23, v22 :: v_dual_lshlrev_b32 v18, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v19, v20 :: v_dual_and_b32 v23, 0xffff0000, v13
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v17, v18
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v22, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v24 :: v_dual_lshlrev_b32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v20 :: v_dual_lshlrev_b32 v26, 16, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v25
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v18, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v18 :: v_dual_lshlrev_b32 v26, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v19, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v23, v18 :: v_dual_and_b32 v21, 0xffff0000, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v22, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v19, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v18, v21 :: v_dual_lshlrev_b32 v27, 16, v19
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v19, v22, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v20, 16, v22
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v22, v20 :: v_dual_and_b32 v23, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v22, v19 :: v_dual_and_b32 v24, 0xffff0000, v2
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v25 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v23, v21 :: v_dual_lshlrev_b32 v28, 16, v24
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v25, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v22
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v23, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v24, v23 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v26
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v25, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v22, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v25, v23 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v28, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v23
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v27 :: v_dual_and_b32 v25, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v29
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v29
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v21, v24, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v23 :: v_dual_lshlrev_b32 v29, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v24 :: v_dual_lshlrev_b32 v24, 16, v26
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v21, v22 :: v_dual_lshlrev_b32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v27
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v26, v23 :: v_dual_lshlrev_b32 v23, 16, v25
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v27, v25 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v30, v29, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v21, v26, v22 :: v_dual_and_b32 v24, 0xffff0000, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v23, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v29, v25, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v26, v25, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v24, v22 :: v_dual_lshlrev_b32 v23, 16, v25
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v25
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v26
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v27, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v26, v25 :: v_dual_lshlrev_b32 v24, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v27
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v24 :: v_dual_lshlrev_b32 v23, 16, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v15, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v15, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v14
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v27 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_lshlrev_b32 v26, 16, v5
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v14, v6 :: v_dual_lshlrev_b32 v24, 16, v27
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v26, v23, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v14, v6 :: v_dual_lshlrev_b32 v15, 16, v13
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v27, v7 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v13 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v14
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v5 :: v_dual_lshlrev_b32 v14, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v13, v5 :: v_dual_lshlrev_b32 v25, 16, v11
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_lshlrev_b32 v24, 16, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v12, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v4 :: v_dual_lshlrev_b32 v13, 16, v15
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v11, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v12, v4 :: v_dual_lshlrev_b32 v15, 16, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v2
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v18, v5, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v10 :: v_dual_lshlrev_b32 v15, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v11, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v12, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v11
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v1 :: v_dual_lshlrev_b32 v14, 16, v10
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v10, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v12
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v0 :: v_dual_lshlrev_b32 v11, 16, v9
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v9, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v10, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v11, v2 :: v_dual_lshlrev_b32 v15, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v15
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v8, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v12, v1 :: v_dual_lshlrev_b32 v8, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v15, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v23, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v14, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v21, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v14, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v19, v4, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y)
@@ -9124,315 +8181,283 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v39
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v38, v35, vcc
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v32, v31, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v35
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v37, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v37
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v31
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
-; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
-; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v53
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v37, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v31, v39, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v52, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v33, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v32
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX8-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v37
+; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v33, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v34, v33, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v35, v37
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v51, v50, vcc
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v50, v34, vcc
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
 ; GFX8-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
 ; GFX8-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v34, v34, v36, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
 ; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v27
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v35
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
 ; GFX8-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
 ; GFX8-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v35, v37, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v36
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
 ; GFX8-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v25
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v37
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
 ; GFX8-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v37, v39, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v24
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v38
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
 ; GFX8-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v38, v38, v48, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v23
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v39
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
 ; GFX8-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v51
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
 ; GFX8-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v22
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v51
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v48
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
 ; GFX8-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
 ; GFX8-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v50, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v21
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v49
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
 ; GFX8-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
 ; GFX8-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v51, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v20
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v50
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
 ; GFX8-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
 ; GFX8-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v50, v52, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v19
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v51
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
 ; GFX8-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v40
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
 ; GFX8-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v51, v51, v53, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v18
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v40
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v52
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
 ; GFX8-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
 ; GFX8-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v54, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
+; GFX8-NEXT:    v_and_b32_e32 v40, 0xffff0000, v17
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v53
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
+; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
 ; GFX8-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
 ; GFX8-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v53, v53, v40, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v40, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
+; GFX8-NEXT:    v_and_b32_e32 v41, 0xffff0000, v16
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
 ; GFX8-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v54
+; GFX8-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
+; GFX8-NEXT:    v_lshlrev_b32_e32 v43, 16, v40
 ; GFX8-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v40
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v42, v43
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v54, v54, v41, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -9442,12 +8467,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v40
 ; GFX8-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v55
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
@@ -9459,12 +8482,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v55
 ; GFX8-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
@@ -9476,12 +8497,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v55, v30
 ; GFX8-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
@@ -9493,12 +8512,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v30, v29
 ; GFX8-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
@@ -9510,12 +8527,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v29, v28
 ; GFX8-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
@@ -9527,12 +8542,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v28, v27
 ; GFX8-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
@@ -9544,12 +8557,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v27, v26
 ; GFX8-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
@@ -9561,13 +8572,15 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v25
 ; GFX8-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
+; GFX8-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
@@ -9578,16 +8591,11 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
@@ -9598,12 +8606,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v23
 ; GFX8-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
@@ -9615,12 +8621,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
@@ -9632,12 +8636,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v21
 ; GFX8-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
@@ -9649,12 +8651,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
@@ -9666,12 +8666,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v19
 ; GFX8-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
@@ -9683,12 +8681,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
@@ -9700,12 +8696,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v54
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -9733,11 +8727,11 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v34
 ; GFX8-NEXT:    v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -9747,315 +8741,284 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; GFX900-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX900-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v39
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v38, v35, vcc
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v31
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v32, v31, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v35
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v37, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v37
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v31
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX900-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX900-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v23
-; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v22
-; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
-; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
-; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
-; GFX900-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX900-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
-; GFX900-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
-; GFX900-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
-; GFX900-NEXT:    s_waitcnt vmcnt(3)
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v53
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v37, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v31
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX900-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v31, v31, v39, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX900-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v52, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; GFX900-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    s_waitcnt vmcnt(4)
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v55
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v33, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v32
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX900-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX900-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
-; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v37
+; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v33, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v33
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v34, v33, vcc
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v35, v37
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX900-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v51, v50, vcc
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v50, v34, vcc
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v34
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
 ; GFX900-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v34
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
 ; GFX900-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v34, v34, v36, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
 ; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v27
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v35
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
 ; GFX900-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v35
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
 ; GFX900-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v35, v35, v37, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v36
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v36
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
 ; GFX900-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v25
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v37
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v37
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
 ; GFX900-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v37, v37, v39, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v24
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v38
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v38
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
 ; GFX900-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v38, v38, v48, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v23
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v39
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
 ; GFX900-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v39
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v51
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
 ; GFX900-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v22
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v51
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v48
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
 ; GFX900-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v48
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
 ; GFX900-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v5
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v50, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v21
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v49
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
 ; GFX900-NEXT:    v_cndmask_b32_e32 v51, v50, v49, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v49
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
 ; GFX900-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v51, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v20
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v50, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v50
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v50
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
 ; GFX900-NEXT:    v_cndmask_b32_e32 v52, v51, v50, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v50
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v51, v50, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
 ; GFX900-NEXT:    v_and_b32_e32 v51, 0xffff0000, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v50, v50, v52, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v52, 16, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v19
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v51
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v51
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
 ; GFX900-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v51
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v40
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v52, v51, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v52, 16, v51
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
 ; GFX900-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v51, v51, v53, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v18
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v40
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v52
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v52
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
 ; GFX900-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v52
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
 ; GFX900-NEXT:    v_and_b32_e32 v53, 0xffff0000, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v54, 16, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v54, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
+; GFX900-NEXT:    v_and_b32_e32 v40, 0xffff0000, v17
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
-; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v53
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v53
+; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
 ; GFX900-NEXT:    v_cndmask_b32_e32 v40, v54, v53, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v53
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
 ; GFX900-NEXT:    v_and_b32_e32 v54, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v53, v40, v53, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v53, v53, v40, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v40, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
+; GFX900-NEXT:    v_and_b32_e32 v41, 0xffff0000, v16
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
 ; GFX900-NEXT:    v_cndmask_b32_e32 v40, v40, v54, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
-; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v54
+; GFX900-NEXT:    v_lshlrev_b32_e32 v42, 16, v54
+; GFX900-NEXT:    v_lshlrev_b32_e32 v43, 16, v40
 ; GFX900-NEXT:    v_cndmask_b32_e32 v41, v40, v54, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v54
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v40
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v42, v43
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v54, v41, v54, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v54, v54, v41, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
@@ -10065,12 +9028,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v40
 ; GFX900-NEXT:    v_cndmask_b32_e32 v40, v55, v15, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v15
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v55
-; GFX900-NEXT:    v_cndmask_b32_e32 v15, v15, v55, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v14
 ; GFX900-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
@@ -10082,12 +9043,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v55
 ; GFX900-NEXT:    v_cndmask_b32_e32 v55, v30, v14, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v14
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v30
-; GFX900-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
 ; GFX900-NEXT:    v_cndmask_b32_e32 v14, v55, v14, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
@@ -10099,12 +9058,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v55, v30
 ; GFX900-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v13
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v29
-; GFX900-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT:    v_lshlrev_b32_e32 v55, 16, v30
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX900-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
@@ -10116,12 +9073,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v30, v29
 ; GFX900-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v12
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v28
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX900-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
@@ -10133,12 +9088,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v29, v28
 ; GFX900-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v11
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v27
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX900-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
@@ -10150,16 +9103,18 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v28, v27
 ; GFX900-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v10
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v26
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
 ; GFX900-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX900-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc
@@ -10167,12 +9122,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v27, v26
 ; GFX900-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v9
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v25
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX900-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
@@ -10180,19 +9133,14 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc
-; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v25
 ; GFX900-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v24
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
@@ -10204,12 +9152,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX900-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v23
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
@@ -10221,12 +9167,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v23
 ; GFX900-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v22
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
@@ -10238,12 +9182,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v22
 ; GFX900-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v21
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
@@ -10255,12 +9197,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v21
 ; GFX900-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v20
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
@@ -10272,12 +9212,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v20
 ; GFX900-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v19
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
@@ -10289,12 +9227,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v19
 ; GFX900-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v18
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
@@ -10306,12 +9242,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v17
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
@@ -10323,14 +9257,11 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v17
 ; GFX900-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v16
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v54, v0, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v53, v1, s4
 ; GFX900-NEXT:    v_perm_b32 v2, v52, v2, s4
@@ -10344,484 +9275,430 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX900-NEXT:    v_perm_b32 v10, v36, v10, s4
 ; GFX900-NEXT:    v_perm_b32 v11, v35, v11, s4
 ; GFX900-NEXT:    v_perm_b32 v12, v34, v12, s4
-; GFX900-NEXT:    v_perm_b32 v13, v33, v13, s4
+; GFX900-NEXT:    v_perm_b32 v13, v32, v13, s4
 ; GFX900-NEXT:    v_perm_b32 v14, v31, v14, s4
-; GFX900-NEXT:    v_perm_b32 v15, v32, v15, s4
+; GFX900-NEXT:    v_perm_b32 v15, v33, v15, s4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimumnum_v32bf16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    scratch_load_dword v50, off, s32
+; GFX950-NEXT:    scratch_load_dword v51, off, s32
 ; GFX950-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v13
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
 ; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v13
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v35, v32, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v29
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX950-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v39, v38, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX950-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX950-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v39
-; GFX950-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v24
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v34, v31, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v38, v35, vcc
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v31
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v37
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v32, v31, vcc
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v35
+; GFX950-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v37, v35, vcc
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v31
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v22
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
-; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v21
-; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v20
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v34
-; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v19
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v31, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v38
-; GFX950-NEXT:    v_and_b32_e32 v40, 0xffff0000, v18
 ; GFX950-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
-; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v26
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
-; GFX950-NEXT:    v_and_b32_e32 v41, 0xffff0000, v17
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v54
 ; GFX950-NEXT:    v_accvgpr_write_b32 a2, v42 ; Reload Reuse
-; GFX950-NEXT:    v_and_b32_e32 v42, 0xffff0000, v16
+; GFX950-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v37, v35, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v31
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v32
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v50
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v50
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v51
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v31, v39, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v51
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v32, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v33, v37
+; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v53, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v35, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v32
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v35, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
-; GFX950-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
-; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX950-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
-; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v33, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v33
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v33
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v33, v36, v34, vcc
-; GFX950-NEXT:    v_and_b32_e32 v34, 0xffff0000, v12
-; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v34, v34
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v34, v33, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v35, v37
+; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v34
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v34
+; GFX950-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v52, v50, vcc
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v34
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v34, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v36, v37
-; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v50, v34, vcc
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v34
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v35, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v34
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v11
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v35
 ; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v11
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v34, v34, v36, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v27
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v36, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
+; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v27
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v37, v38
-; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v35
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v36, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v35
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v37
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v35
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v36
 ; GFX950-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v35, v35, v37, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v36, v36
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v37, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v36
+; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v36, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v38, v39
-; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v36
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v37, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v36
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v9
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v37, v36, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v37
 ; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v9
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v37, v37
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v38, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v25
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v37, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v39, v48
-; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v37
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v38, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v37
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v38
 ; GFX950-NEXT:    v_and_b32_e32 v38, 0xffff0000, v8
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v37, v37, v39, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v38, v38
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v39, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v24
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v38, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v48, v49
-; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v38
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v39, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v38
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v48
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v39, v38, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v39
 ; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v38, v38, v48, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v39, v39
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v48, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v39
+; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v23
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v39
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v39, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v49, v51
-; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v6
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v39
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v48
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v48, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v39
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v52
+; GFX950-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v48
 ; GFX950-NEXT:    v_and_b32_e32 v48, 0xffff0000, v6
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v48, v48
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v49, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v48
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX950-NEXT:    v_and_b32_e32 v50, 0xffff0000, v22
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v48
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v48, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v52
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v48
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v49
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v49, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v48
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v51
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v49, v48, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v49
 ; GFX950-NEXT:    v_and_b32_e32 v49, 0xffff0000, v5
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v48, v51, v48, vcc
-; GFX950-NEXT:    v_lshrrev_b32_e32 v51, 16, v21
+; GFX950-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v49, v49
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v51, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v49
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v50, vcc
+; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v21
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v49
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v49, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v49
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v50
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v50, v49, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v51, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v49
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v51, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v52
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
-; GFX950-NEXT:    v_and_b32_e32 v51, 0xffff0000, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX950-NEXT:    v_and_b32_e32 v50, 0xffff0000, v4
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v49, v49, v52, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v52, 16, v20
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v52, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v51
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v53, v52, vcc
+; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v20
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v50
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v51, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v50, vcc
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v50
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v52, v50, vcc
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v55
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v52, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v51
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v52, v50, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v52
 ; GFX950-NEXT:    v_and_b32_e32 v52, 0xffff0000, v3
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v51, v53, v51, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v53, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v53, 16, v19
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v52, v52
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v53, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
+; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v19
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v52, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v54, v55
-; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v52
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v53, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v52
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v54
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v55, v40
+; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v53, v52, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v53
 ; GFX950-NEXT:    v_and_b32_e32 v53, 0xffff0000, v2
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v53, v53
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v53
+; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v18
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v53
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v53, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v55, v40
-; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v53
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v54, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v53
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v1
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v54, v53, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v54, 16, v53
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v54
 ; GFX950-NEXT:    v_and_b32_e32 v54, 0xffff0000, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v53, v53, v55, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v55, 16, v17
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v54, v54
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v55, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v54
+; GFX950-NEXT:    v_and_b32_e32 v40, 0xffff0000, v17
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v54
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v54, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v41
-; GFX950-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v54
+; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v55
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v40, v55, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v54
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX950-NEXT:    v_lshrrev_b32_e32 v41, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v55, v54, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v55
 ; GFX950-NEXT:    v_and_b32_e32 v55, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v54, v54, v40, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v55, v55
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v40, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v42, v42
-; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v55
+; GFX950-NEXT:    v_and_b32_e32 v41, 0xffff0000, v16
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v41, v41
+; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v55
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v40, v40, v55, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v42
-; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v55
+; GFX950-NEXT:    v_lshlrev_b32_e32 v43, 16, v40
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v41, v40, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v55
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v40
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v40, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v41
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v42, v43
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v40, v55, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v55
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v55, v55, v41, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v51
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v51, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v40, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v15
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v50, v50, v15, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v50
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v51, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v51
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v41, v40
-; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v40, v50, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v50
+; GFX950-NEXT:    v_cndmask_b32_e32 v40, v51, v15, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v40
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v15
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v15, v50, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v40
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v50
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v14
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v51, v15, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v41
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v14
+; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v40, v15, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT:    v_perm_b32 v15, v33, v15, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v51, v51
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v30, v30, v14, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v30
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v50
-; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v50, v30, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v14
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v40, v51
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v30
+; GFX950-NEXT:    v_cndmask_b32_e32 v51, v30, v14, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v51
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v14
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v50
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v40
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v13
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v50, v14, vcc
+; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v51, v14, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v14, v31, v14, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
-; GFX950-NEXT:    v_lshlrev_b32_e32 v50, 16, v13
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v13
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v50, v30
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v51, v30
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v29
+; GFX950-NEXT:    v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v13
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v51
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v30, v13, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v13, v32, v13, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
@@ -10831,20 +9708,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v30, v29
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v29, v28, v12, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v28
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v12
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v30
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v12, v34, v12, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
@@ -10854,20 +9728,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v29, v28
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v27
+; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v11
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v29
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v10
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v11, v35, v11, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
@@ -10877,20 +9748,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v28, v27
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v27, v26, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v10
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v26
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
-; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v9
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v10, v36, v10, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
@@ -10900,20 +9768,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v27, v26
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v26, v25, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v25
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v27
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v9, v37, v9, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
@@ -10923,20 +9788,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v26, v25
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v8
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v24
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v26
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v8, v38, v8, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
@@ -10946,20 +9808,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v25, v24
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v23
+; GFX950-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v7
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v25
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v7, v39, v7, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
@@ -10969,20 +9828,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v24, v23
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v22
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v24
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v6, v48, v6, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
@@ -10992,20 +9848,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v23, v22
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v21
+; GFX950-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v5, v49, v5, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
@@ -11015,20 +9868,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v22, v21
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v20, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v21, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v4, v50, v4, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
@@ -11038,20 +9888,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v21, v20
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v19
+; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v3, v52, v3, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
@@ -11061,20 +9908,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v20, v19
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v18
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v2
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v2, v53, v2, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
@@ -11084,20 +9928,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v19, v18
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v17
+; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v1, v54, v1, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
@@ -11107,630 +9948,549 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v18, v17
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v16, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v18
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v16
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v54, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
-; GFX950-NEXT:    v_perm_b32 v2, v53, v2, s0
-; GFX950-NEXT:    v_perm_b32 v3, v52, v3, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v55, v0, s0
-; GFX950-NEXT:    v_perm_b32 v4, v51, v4, s0
-; GFX950-NEXT:    v_perm_b32 v5, v49, v5, s0
-; GFX950-NEXT:    v_perm_b32 v6, v48, v6, s0
-; GFX950-NEXT:    v_perm_b32 v7, v39, v7, s0
-; GFX950-NEXT:    v_perm_b32 v8, v38, v8, s0
-; GFX950-NEXT:    v_perm_b32 v9, v37, v9, s0
-; GFX950-NEXT:    v_perm_b32 v10, v36, v10, s0
-; GFX950-NEXT:    v_perm_b32 v11, v35, v11, s0
-; GFX950-NEXT:    v_perm_b32 v12, v34, v12, s0
-; GFX950-NEXT:    v_perm_b32 v13, v33, v13, s0
-; GFX950-NEXT:    v_perm_b32 v14, v31, v14, s0
-; GFX950-NEXT:    v_perm_b32 v15, v32, v15, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v29
-; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
+; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v30
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v33, v32, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v38, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v32, v31, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; GFX10-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v27
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v32
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v35, v34, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v12
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v51, v51
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v48, v39, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
-; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v23
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v35, v32, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v33
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v70, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v38, v34, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v37
-; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; GFX10-NEXT:    v_cndmask_b32_e32 v35, v39, v33, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v34
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v31, v38
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v26
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v53, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v35
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v39, v48
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v9
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v25
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v49, v50
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v52, v38, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v39, v39
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v38
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v50, v49, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v49, v39, s6
-; GFX10-NEXT:    v_cmp_u_f32_e64 s6, v52, v52
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v54, v53, s6
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v51, v55
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v52, v52
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v50
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v49
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v49, s7
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v23
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v55, v55
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s8, v31, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v65, v64, s7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v36, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v34, v33, s4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v35, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v33
+; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v36, v37, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v38, v38
+; GFX10-NEXT:    v_and_b32_e32 v70, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v81, 0xffff0000, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, v49, v48, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v50, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v34
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v70, v70
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v81, v81
+; GFX10-NEXT:    v_lshrrev_b32_e32 v85, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v37, v36, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v51, v51
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v10
+; GFX10-NEXT:    v_cmp_ne_u16_e64 s20, 0, v31
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v48, v35, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, v39, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v37
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v51, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v9
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s6, v39, v48
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, v49, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, v53, v52, s7
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v25
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v48, v48
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v52, v39, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v49, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v51, v50, s7
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v8
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v64, v55, s7
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v65, v65
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v50, v49, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v55, v54, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v53, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v48
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s8, v64, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v54, v51, s7
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v7
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s9, v52, v55
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v22
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v67, v66, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v52
-; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v64, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v65
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s9, v54, v67
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v66, v65, s7
+; GFX10-NEXT:    v_cmp_u_f32_e64 s7, v54, v54
+; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v67, v66, s7
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, v68, v69
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v4
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v54, v54
-; GFX10-NEXT:    v_lshrrev_b32_e32 v69, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v67, v66, s10
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v68, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v70, v69, s10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v67, v67
-; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v66, v54, s10
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v71, v71
-; GFX10-NEXT:    v_lshrrev_b32_e32 v71, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v69, v68, s10
-; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v3
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s11, v70, v81
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v67
+; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v66, v54, s10
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v55, v55
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v65, v64, s10
+; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v66, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v66, v68, v67, s10
+; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v65, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v71, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s10
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v69, v69
-; GFX10-NEXT:    v_and_b32_e32 v70, 0xffff0000, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s12, v82, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v71, s10
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s10, v31, v51
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v64
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v67, v66, s10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v52
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s11, v69, v70
+; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v65
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s10, v68, v67
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v83, v82, s13
+; GFX10-NEXT:    v_and_b32_e32 v69, 0xffff0000, v3
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s12, v71, v80
+; GFX10-NEXT:    v_lshrrev_b32_e32 v70, 16, v19
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v68, v68
+; GFX10-NEXT:    v_lshrrev_b32_e32 v71, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v68, v82, v67, s13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v69, v69
+; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v68
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v71, v70, s13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v80, v80
+; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v80, 16, v18
-; GFX10-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v69
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v71, v69, s13
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v70, v70
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v81, v80, s13
-; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v82, v82
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v82, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v80, v70, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v70, v70, v69, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s13, v71, v71
-; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v82, v81, s13
+; GFX10-NEXT:    v_and_b32_e32 v71, 0xffff0000, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, v82, v80, s13
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s14, v71, v71
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s13, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v70
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v71, v81, v82, s14
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s14, v31, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v82
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s13, v81, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v69
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v70
+; GFX10-NEXT:    v_cndmask_b32_e64 v71, v80, v82, s14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v82
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s14, v81, v83
+; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v71
-; GFX10-NEXT:    v_lshrrev_b32_e32 v83, 16, v0
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s15, v31, v81
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v16
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s15, v80, v81
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v81, 16, v17
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v80, v80
+; GFX10-NEXT:    v_and_b32_e32 v80, 0xffff0000, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v83, v83, v81, s16
-; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v31, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v81, v83, s16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v81
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s16, v31, v84
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v31, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v84, s17
-; GFX10-NEXT:    v_and_b32_e32 v85, 0xffff0000, v30
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, v80, v80
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v81, v83, s16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v80
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s16, v81, v84
+; GFX10-NEXT:    v_and_b32_e32 v81, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v16
+; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v81, v81
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v85, v84, s17
+; GFX10-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
 ; GFX10-NEXT:    v_cmp_u_f32_e64 s17, v85, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v84, v31, s17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v81
+; GFX10-NEXT:    v_cndmask_b32_e64 v84, v84, v81, s17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v84
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s17, v85, v86
-; GFX10-NEXT:    v_lshrrev_b32_e32 v86, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v84, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v31
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v31, v84, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v85
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s17, 0, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v84, v37, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v31, v85, v31, s17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v84, v32, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v37
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v32, v37, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v37, v36, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v34
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v39
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v37, v34, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v36
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v34, v36, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v36, v35, v33, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v38
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v33, v35, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v36
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v48, v38, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v33, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v35, v38, s4
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v38, v48, s4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v35
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v50, v39, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v65
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v48, v39, s5
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v39, v50, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v48
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v50
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v52, v49, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v50, v49, s6
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v49, v52, s6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v50
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s6, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v53, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v52, v55, s7
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v55, v53, s7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v52
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s7, 0, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v64, v65, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v69
-; GFX10-NEXT:    v_cndmask_b32_e64 v36, v52, v53, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v55, v65, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v64
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s8
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v66, v54, s11
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v54
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v65, v54, s8
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v54, v54, v66, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v65
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s8, 0, v66
-; GFX10-NEXT:    v_cndmask_b32_e64 v66, v67, v68, s12
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v66, v68, s9
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v67
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v68, v67, s9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v66
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s9, 0, v68
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v51, v69, s13
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v68, v69, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v51
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v69, v51, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v70, s14
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v70
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v69, v70, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v70, v70, v80, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v80, v71, v82, s15
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v80, v82, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v71
-; GFX10-NEXT:    v_cndmask_b32_e64 v71, v82, v71, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v82, v81, v83, s16
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v82
-; GFX10-NEXT:    v_cndmask_b32_e64 v83, v82, v83, s10
-; GFX10-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v81
-; GFX10-NEXT:    v_cndmask_b32_e64 v81, v83, v81, s10
-; GFX10-NEXT:    buffer_load_dword v83, off, s[0:3], s32
-; GFX10-NEXT:    v_cmp_u_f32_e64 s10, v85, v85
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v85, v85
-; GFX10-NEXT:    v_cndmask_b32_e64 v85, v14, v30, s11
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v85, v85
+; GFX10-NEXT:    v_cndmask_b32_e64 v85, v14, v30, s18
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v30
-; GFX10-NEXT:    v_cmp_u_f32_e64 s11, v14, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v84
-; GFX10-NEXT:    v_cndmask_b32_e64 v87, v30, v85, s11
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v30, v35, v38, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, v50, v49, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v38, v65, v54, s8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v80
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v84, v32, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v37
-; GFX10-NEXT:    v_and_b32_e32 v84, 0xffff0000, v15
-; GFX10-NEXT:    v_cmp_eq_f32_e64 s12, 0, v32
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v37, v34, s12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v48, v39, s5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v68
+; GFX10-NEXT:    v_lshlrev_b32_e32 v86, 16, v85
+; GFX10-NEXT:    v_cmp_u_f32_e64 s18, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v30, v85, s18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v30
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s18, v86, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v86, v13, v29, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v86
+; GFX10-NEXT:    v_cmp_u_f32_e64 s19, v13, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v87, v29, v86, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v87
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s19, v14, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v32, v31, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v32, v31, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, v87, v86, s19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v13
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v13, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v34, v33, s4
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v33
+; GFX10-NEXT:    v_cmp_ne_u16_e64 s4, 0, v85
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v29, v34, v33, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v48, v39, s9
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v38, v36, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v38, v36, vcc_lo
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v29
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v50, v49, s8
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v37, v35, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v29, v29, v31, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v35
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v37, v35, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v48, v39, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v49
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, v53, v51, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v50, v49, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v52, v54, s10
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v53, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v54
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v64, v55, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v65, v66, s12
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v52, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v55
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v64, v55, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v68, v67, s13
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v65, v66, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v70, v69, s14
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v68, v67, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v69
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v71, v82, s15
+; GFX10-NEXT:    v_cndmask_b32_e32 v64, v70, v69, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v82
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v80, v83, s16
+; GFX10-NEXT:    v_cndmask_b32_e32 v66, v71, v82, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v83
+; GFX10-NEXT:    v_cndmask_b32_e64 v71, v84, v81, s17
+; GFX10-NEXT:    v_cndmask_b32_e32 v68, v80, v83, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v81
+; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v83, 0xffff0000, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v70, v84, v81, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX10-NEXT:    v_cndmask_b32_e64 v80, v30, v85, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v81, v30, v85, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v84, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v85, 16, v11
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v30, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v28
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, v12, v28, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v32
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v30, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v34
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v96, v28, v82, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v32, v31, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v34, v33, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v49
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v51
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v33
+; GFX10-NEXT:    v_cndmask_b32_e64 v31, v36, v35, s5
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s5, 0, v32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v55
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v49, v48, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v34
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, v39, v37, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v65
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v67
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v69
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v55, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, v51, v50, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v71
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v80
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v53, v52, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, v55, v54, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, 0, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v65, v64, s4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v83, v83
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v83
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v15, v83, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v66, v67, s9
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v86, v50, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v50, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v83, v64, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v15, v38, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v39, v68, v51, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v38
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v67, v66, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v84, v37, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v38, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v37, v54, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX10-NEXT:    v_cndmask_b32_e32 v48, v69, v70, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v53, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v66
-; GFX10-NEXT:    v_cndmask_b32_e32 v65, v55, v64, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v69, v68, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
-; GFX10-NEXT:    v_cndmask_b32_e32 v50, v80, v71, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v65, v64, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v51
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v49, v53, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v65
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v49, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
-; GFX10-NEXT:    v_cndmask_b32_e32 v52, v82, v81, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v71, v70, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v52
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v38, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v64
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v39, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v38, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v39, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v97
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v80, v81, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v51, v65, v54, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v85
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v87, v85, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v85
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v53, v85, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v55, v87, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v55, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v55
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v53, v28, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v65
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v54, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v27
+; GFX10-NEXT:    v_perm_b32 v14, v14, v38, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v49, v52, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v82
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v51, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v96
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v54, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX10-NEXT:    v_perm_b32 v13, v14, v13, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v53, v12, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v11
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v9
-; GFX10-NEXT:    v_perm_b32 v14, v31, v28, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v12, v32, v12, 0x5040100
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v86
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v87, v86, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v52, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v96, v82, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v97, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v26
+; GFX10-NEXT:    v_perm_b32 v13, v13, v27, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v82
+; GFX10-NEXT:    v_cndmask_b32_e32 v64, v96, v82, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v52, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v26, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v54
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v51, v64, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v25
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v52, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v64, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v53, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v29
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v54, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v53, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v29, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
-; GFX10-NEXT:    v_perm_b32 v11, v33, v11, 0x5040100
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v24
+; GFX10-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_perm_b32 v12, v29, v26, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v53, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v51, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v23
 ; GFX10-NEXT:    v_perm_b32 v10, v30, v10, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v24
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
 ; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_perm_b32 v9, v34, v9, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v53, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v22
+; GFX10-NEXT:    v_perm_b32 v9, v31, v9, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_perm_b32 v8, v35, v8, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX10-NEXT:    v_perm_b32 v7, v36, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v25, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX10-NEXT:    v_perm_b32 v8, v28, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v52, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v50
+; GFX10-NEXT:    v_perm_b32 v7, v32, v7, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v3
+; GFX10-NEXT:    v_perm_b32 v6, v33, v6, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX10-NEXT:    v_perm_b32 v5, v38, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX10-NEXT:    v_perm_b32 v5, v34, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX10-NEXT:    v_perm_b32 v3, v39, v3, 0x5040100
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX10-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX10-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v37, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v20, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX10-NEXT:    v_perm_b32 v2, v48, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v15, v49, v51, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v48, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT:    v_perm_b32 v2, v15, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v15, v49, v39, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v35, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimumnum_v32bf16:
@@ -11740,1268 +10500,1139 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v35, v35
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v37, v37
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v23
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v36, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v38, v38
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v55, v55
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v13.h, v29.h, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v12.h, v28.h, s5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.h, v32.l, s2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v35, v35
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v37, v37
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v49, v49
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v53, v53
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v55.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v36, v36
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v38, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v14
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v48, v48
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v50, v50
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v64, v64
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v68, v68
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v13.h, v29.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v12.h, v28.h, s5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v11.h, v27.h, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v8.h, v24.h, s13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v7.h, v23.h, s15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v6.h, v22.h, s17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v5.h, v21.h, s19
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v118
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v14
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.l, v29.h, v33.l, s4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v81.l, v20.h, v50.l, s22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v54.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s40, v86, v118
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v85.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v87, v87
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v96, v96
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v32.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v33.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v34.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v50.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v51.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v67.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v68.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v69.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v70.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v71.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v80.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v81.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v82.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v116
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v128
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v49, v49
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v53, v53
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v97, v97
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v32.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v53.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v33.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v55.l, v32.l, s1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v96
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v129
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v86, v128
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v130
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v28
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v50, v50
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v68, v68
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v10.h, v26.h, s9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v8.h, v24.h, s13
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v35.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v64.l, v33.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v65.l, v34.l, s3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v87, v129
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v131
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v96, v128
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s44, v99, v99
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s45, v100, v100
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v38.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v66.l, v35.l, s4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v68.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v97, v129
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v65.l, v34.l, s3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s43, v98, v98
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v99
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v133
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v64.l, v33.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v66.l, v35.l, s4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v7.h, v23.h, s15
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v6.h, v22.h, s17
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v36.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s7, 0, v38.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v98
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v68.l, v37.l, s6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v132
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v99, v129
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v55.l, v32.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v33.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s46, v101, v101
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v5.h, v21.h, s19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v48.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v67.l, v36.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v69.l, v38.l, s7
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v98, v128
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s7, v86, v100
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v68.l, v37.l, s6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v66
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v4.h, v20.h, s21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v101
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v103
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v71.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v67.l, v36.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v69.l, v38.l, s7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v64
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v65
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v2.h, v18.h, s25
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s8, 0, v39.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s9, 0, v48.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v81.l, v20.h, v50.l, s22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v50.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v80.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s10, 0, v49.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v82.l, v19.h, v51.l, s24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v51.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v83.l, v18.h, v52.l, s26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v53.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v70.l, v39.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v71.l, v48.l, s9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v113
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v81.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s8, v87, v101
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v84.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v112
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s9, v96, v103
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v67
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v69
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s11, 0, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v52.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s14, 0, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v54.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v80.l, v49.l, s10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v114
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v82.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v116
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v85.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s10, v97, v112
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v113
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v83.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v147.l, v84.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v119
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v128
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v129
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v130
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v131
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v132
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v133
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v134
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v135
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v144
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v145
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s63, v116, v86
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v99
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v71.l, v48.l, s9
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v68
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s43
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s12, 0, v51.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s13, 0, v52.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s15, 0, v54.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v81.l, v50.l, s11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v115
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v84.l, v53.l, s14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v117
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v114
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v146
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v147
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s42, v87, v118
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s43, v96, v119
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s45, v98, v129
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s56, v101, v132
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s59, v112, v135
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s60, v113, v144
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v86.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v34.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v36.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v39.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v50.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v51.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s46, v99, v130
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s61, v114, v145
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s62, v115, v146
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v118
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v33.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v37.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v102.l, v48.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v52.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0x8000, v53.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0x8000, v70.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0x8000, v81.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v101.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v112.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v113.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v55
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v38.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v49.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v54.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v102
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s44, v97, v128
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v114.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v115.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v39
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v35.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s57, v102, v133
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v116.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v51
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v48.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0x8000, v65.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v66.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s47, v100, v131
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s58, v103, v134
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v96.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v97.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v52
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v38.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v49.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v64.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0x8000, v71.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v87.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v102.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v128
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v129
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0x8000, v80.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0x8000, v85.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v103.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v119
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v48
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v65
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v66
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v86.l, v13.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v49
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v64
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v71
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v67.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v68.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v98.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v99.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v130
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v131
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0x8000, v84.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v70
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v67
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0x8000, v82.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v68
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0x8000, v83.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v80
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0x8000, v69.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v100.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v81
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v132
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v69
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s11, v98, v113
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s14, v87, v101
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v80.l, v49.l, s10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.l, v48.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v11.h, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s44
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.h, v82.l, v51.l, s12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.h, v83.l, v52.l, s13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.h, v85.l, v54.l, s15
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s12, v99, v114
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s13, v86, v100
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s15, v96, v103
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v70.l, v39.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v81.l, v50.l, s11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v53.l, v84.l, v53.l, s14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v80
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v12.h, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v29.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v82.l, v51.l, s12
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v52.l, v83.l, v52.l, s13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v54.l, v85.l, v54.l, s15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.l, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v81
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v80
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.h, v36.l, v30.h, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v119
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.l, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v71
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v81
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v29.h, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s46
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v118
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s17, v116, v115
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v70
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v82
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v83
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v84
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v71
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v85
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v28.h, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.h, v37.l, v32.h, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v49.l, v37.h, s10
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s16, v112, v97
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v70
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v82
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v83
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v84
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.h, v39.l, v35.h, s8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v28.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.l, v14.l, s16
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v38.l, v33.h, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.h, v50.l, v38.h, s11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v54.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v86
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v54.l, v50.h, s15
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v31
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v31
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v50.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v15.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v51.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v54, v54
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v50.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v52, v53
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v52
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v32.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v50.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v64.l, v15.h, v31.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v66, v66
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v64.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v38, v53
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v31.l, v15.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v65.l, v31.h, v64.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.h, v48.l, v36.h, s9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v12.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v14.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v29
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v33.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v117, v117
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v31.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v67, v69
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v66, v68
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v65.l, v64.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v15.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v64.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v32.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v31.l, v15.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v65.l, v64.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v29.l, v13.l, s17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.h, v51.l, v39.h, s12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v31.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v37
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.h, v53.l, v49.h, s14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.h, v52.l, v48.h, s13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v102, v102
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v13.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v29.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v52, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v29.l, v13.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v30.l, v14.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v13.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v48, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v10
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v29.l, v13.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v28.l, v12.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v55.l, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v49, v48
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.l, v26.l, v10.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v27.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v31.l, v10.h, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v26.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v12.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v28.l, v12.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v11.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v49, v48
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.l, v25.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.l, v11.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.l, v26.l, v10.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v29.l, v9.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v30.l, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v49, v48
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v24.l, v24.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v25.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v24.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v28.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v13.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v30.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v51, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v29.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v29.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v11.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v31.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v28.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v26.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v51, v50
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v27, v26
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v12.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v12.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v23.l, v7.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v26.l, v10.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v49, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v23.l, v7.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v25.l, v9.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v24.l, v8.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v26
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v25, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v27.l, v7.h, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v22.l, v22.l, v6.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v23.l, v10.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v8.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.h, v24.l, v8.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v21.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v39
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v10.l, v7.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v27, v26
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v8.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v20
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v12, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v6.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v9.l, v5.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v30.l, v23.l, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v20.l, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v8.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v5.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v6.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.h, v8.l, v5.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v31.l, v7.l, v3.h, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v23, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v7.l, v4.h, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v18.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v12, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v17.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v10.l, v16.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.l, v4.l, s3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v17, v16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v11
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v9.l, v2.h, s3
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v16.l, v6.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v17.l, v7.l, v2.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v18
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v16.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v17.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v18.l, v8.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v19.l, v10.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v20
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v10.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v1.l, s5
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v2.l, s6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v38.l, v19.l, v0.l, s8
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v37.l, v18.l, v0.h, s7
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v36.l, v17.l, v1.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v35.l, v16.l, v1.h, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v33.l, v5.l, v2.l, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v49 :: v_dual_mov_b32 v2, v48
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v39 :: v_dual_mov_b32 v4, v38
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v36 :: v_dual_mov_b32 v6, v35
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v7, v34 :: v_dual_mov_b32 v8, v33
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v32 :: v_dual_mov_b32 v10, v31
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v30 :: v_dual_mov_b32 v12, v37
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v38 :: v_dual_mov_b32 v1, v37
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, v33 :: v_dual_mov_b32 v5, v32
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v31 :: v_dual_mov_b32 v7, v30
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v8, v29 :: v_dual_mov_b32 v9, v28
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v10, v34
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v32bf16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v116, v115, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v128, v119, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v132, v131, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v144, v135, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v39
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v80
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v55, v64, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v51
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v54
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v96
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v114
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v116
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v69, v71, v80, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v71, v83, v84, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v83, v87, v96 :: v_dual_and_b32 v98, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s7, 0, v118
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v99, v100, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v97, v103, v112, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v112
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v101, v115, v114, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v116
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v119, v116, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v118, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v129, v135, v128, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v118
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v147, v34, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v49, v130
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v37, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v131
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v132
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v69, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v81, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v85, v135
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v97, v144
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v101, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v112, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v114, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v116, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v118, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v128, v49
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v30, v14 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v68
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v84
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v36
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v48
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v133, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v52
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v135, v51, v52 :: v_dual_lshlrev_b32 v50, 16, v15
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v64
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v145, v54, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v68
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v147, v65, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v37, v132
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v132, v69, v80, s0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v49, v134
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v65
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v134, v71, v84, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e64 vcc_lo, 0, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v114
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v53, v144
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v144, v83, v96, s2
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v55, v146
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v129, v128, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v146, v86, v100, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v80
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v67, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v54, v64, s2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v38, v34 :: v_dual_lshlrev_b32 v53, 16, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v86
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v97, v112, s4
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s4, v70, v49
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v101, v114, s5
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s5, v81, v53
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v65, v68, s3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v100
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v97
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v71, v84, s5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v112
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v84, v30, v14 :: v_dual_lshlrev_b32 v85, 16, v96
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v87, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v113, v116, s6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v86, v100 :: v_dual_lshlrev_b32 v36, 16, v117
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v99, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e64 s6, v85, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v113
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v97, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v103, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v128
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v86
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v96
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v98
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v102
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v115
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v119
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v38
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v83, v96, s6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v101, v114, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v115, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v69, v80, s4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v113, v116 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v119, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v69
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v117, v118 :: v_dual_lshlrev_b32 v87, 16, v71
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v130, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v53
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v52, v129, v128 :: v_dual_lshlrev_b32 v101, 16, v55
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v131, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v36
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v117, v118, s7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v80
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v82, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v83
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v35, v35, v98 :: v_dual_lshlrev_b32 v68, 16, v51
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v39, v133, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v135, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v80
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v54, v145, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v82
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v65, v147, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v86
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v69, v132, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v71, v134 :: v_dual_and_b32 v69, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v96
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v83, v144, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v97
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v146, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v99
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v53, v70, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v101
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v55, v81, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v67
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v64, v53
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v103
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v36, v85, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v67, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v112
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v36
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v52, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v31
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v113
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v67, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v36, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v32
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v36, v32 :: v_dual_lshlrev_b32 v31, 16, v50
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v84, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v31
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v50, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v52, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v30
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v64
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v31, v15, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v28
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v29, v28, v12 :: v_dual_lshlrev_b32 v50, 16, v11
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v30, v13 :: v_dual_lshlrev_b32 v32, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v36, v30
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v10 :: v_dual_lshlrev_b32 v30, 16, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v51, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v29, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v32, v29
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v9 :: v_dual_lshlrev_b32 v29, 16, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v30, v28
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v25, v24, v8 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v65, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v27, 16, v23
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v25, v8 :: v_dual_lshlrev_b32 v27, 16, v22
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v66, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v27
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v22, v6 :: v_dual_lshlrev_b32 v27, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v68, v7, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v20 :: v_dual_lshlrev_b32 v25, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v22
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v21, v5 :: v_dual_lshlrev_b32 v25, 16, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v4 :: v_dual_lshlrev_b32 v23, 16, v26
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v24, 16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v20, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v22
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v18, v2 :: v_dual_lshlrev_b32 v25, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v55, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_lshlrev_b32 v23, 16, v24
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v48, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v20, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v53, v4, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v32bf16:
@@ -13015,747 +11646,660 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v30
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v16
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v55, v55
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.h, v32.l, s2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v33, v33
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v29
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v28
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v23
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v22
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v21
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v34, v34
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v35, v35
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s5, v37, v37
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v49, v49
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v53, v53
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v55.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v14.h, v30.h, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v27
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v23
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v6
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v16
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v36, v36
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s6, v38, v38
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v48, v48
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v50, v50
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v64, v64
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v68, v68
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s7, v39, v39
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s15, v55, v55
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v13.h, v29.h, s3
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v12.h, v28.h, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v11.h, v27.h, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v8.h, v24.h, s13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v7.h, v23.h, s15
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v6.h, v22.h, s17
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v5.h, v21.h, s19
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v118
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.h, v32.l, s2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v22
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v15
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v14
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s8, v48, v48
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s16, v64, v64
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s17, v65, v65
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s40, v86, v86
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v11.h, v27.h, s7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v32.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.l, v29.h, v33.l, s4
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v81.l, v20.h, v50.l, s22
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v54.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s40, v86, v118
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v85.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v55.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v25
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v30
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s11, v51, v51
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s18, v66, v66
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v87, v87
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s41, v96, v96
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v32.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v33.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v34.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v50.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v51.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v64.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v65.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v66.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v67.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v68.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v69.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v70.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v71.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v80.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v81.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v82.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v116
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v66.l, v27.h, v35.l, s8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v64.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v65.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v128
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v26
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s9, v49, v49
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s12, v52, v52
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s13, v53, v53
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s42, v97, v97
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v32.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v52.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v53.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v9.h, v25.h, s11
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v33.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v34.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v55.l, v32.l, s1
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v96
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v98
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v112
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v113
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v83.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v147.l, v84.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v118, 16, v119
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v128
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v129
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v130
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v131
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v132
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v133
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v134
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v135
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v144
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v145
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s63, v116, v86
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 16, v13
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s16, 0x8000, v55.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v99
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v114
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v146
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v147
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s42, v87, v118
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s43, v96, v119
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s45, v98, v129
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s56, v101, v132
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s59, v112, v135
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s60, v113, v144
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v66.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v129
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v86, v128
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v130
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v5
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v29
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v28
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s10, v50, v50
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s14, v54, v54
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s20, v68, v68
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v10.h, v26.h, s9
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v8.h, v24.h, s13
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v35.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v68.l, v25.h, v37.l, s12
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v86.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s3, 0x8000, v34.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v35.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s5, 0x8000, v36.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s8, 0x8000, v39.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s11, 0x8000, v50.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s12, 0x8000, v51.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s46, v99, v130
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s61, v114, v145
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s62, v115, v146
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v118
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v33.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s6, 0x8000, v37.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v102.l, v48.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s13, 0x8000, v52.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s14, 0x8000, v53.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v64.l, v33.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v65.l, v34.l, s3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s23, 0x8000, v70.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s26, 0x8000, v81.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v101.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v112.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v113.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v55
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v87, v129
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v131
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v96, v128
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s19, v67, v67
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s21, v69, v69
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s44, v99, v99
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s45, v100, v100
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v67.l, v26.h, v36.l, s10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v99.l, v37.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v69.l, v24.h, v38.l, s14
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v38.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v49.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s15, 0x8000, v54.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v102
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s44, v97, v128
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v114.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v115.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v39
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s4, 0x8000, v35.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s57, v102, v133
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v116.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v51
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s9, 0x8000, v48.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s18, 0x8000, v65.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s19, 0x8000, v66.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s47, v100, v131
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s58, v103, v134
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v128.l, v96.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v129.l, v97.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v52
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s7, 0x8000, v38.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s10, 0x8000, v49.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s17, 0x8000, v64.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s24, 0x8000, v71.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v66.l, v35.l, s4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v133.l, v68.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s4, v97, v129
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v87.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v102.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v128
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v129
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s25, 0x8000, v80.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s41, 0x8000, v85.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v103.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v119
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v48
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v65
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v66
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v86.l, v13.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v65.l, v34.l, s3
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s43, v98, v98
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v36.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v37.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v67.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v99
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v100
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v69.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 16, v133
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v64.l, v33.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v66.l, v35.l, s4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v34.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v20
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v3
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v7.h, v23.h, s15
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v6.h, v22.h, s17
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v36.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s7, 0, v38.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v98
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v49
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v64
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v71
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s20, 0x8000, v67.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s21, 0x8000, v68.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v98.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v99.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v68.l, v37.l, s6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v132
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s6, v99, v129
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v55.l, v32.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v33.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v19
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v17
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s22, v70, v70
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s23, v71, v71
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s27, v83, v83
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s46, v101, v101
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v5.h, v21.h, s19
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v70.l, v23.h, v39.l, s16
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v39.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v71.l, v22.h, v48.l, s18
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v48.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v67.l, v36.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v69.l, v38.l, s7
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s5, v98, v128
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s7, v86, v100
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v68.l, v37.l, s6
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v32.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v66
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v18
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s24, v80, v80
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s25, v81, v81
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s28, v84, v84
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s29, v85, v85
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v4.h, v20.h, s21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v1.h, v17.h, s27
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v80.l, v21.h, v49.l, s20
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v49.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v101
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v70.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v103
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v71.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v67.l, v36.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v69.l, v38.l, s7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v35.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v37.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v64
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v65
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s26, v82, v82
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v3.h, v19.h, s23
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v130
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v131
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 0x8000, v84.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v70
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v67
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s27, 0x8000, v82.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v68
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v2.h, v18.h, s25
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v0.h, v16.h, s29
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s8, 0, v39.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s9, 0, v48.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v81.l, v20.h, v50.l, s22
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v50.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v84.l, v17.h, v53.l, s28
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v112
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v112.l, v80.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v36.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v64
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s10, 0, v49.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v82.l, v19.h, v51.l, s24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v51.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v83.l, v18.h, v52.l, s26
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v116.l, v53.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v85.l, v16.h, v54.l, s40
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s28, 0x8000, v83.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v80
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s22, 0x8000, v69.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v70.l, v39.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v71.l, v48.l, s9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v113
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v113.l, v81.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s8, v87, v101
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v84.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v112
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s9, v96, v103
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v67
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s6, 0, v69
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s11, 0, v50.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v52.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s14, 0, v53.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v54.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v80.l, v49.l, s10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v114
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v114.l, v82.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v100.l, v83.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v116
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v103.l, v85.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s10, v97, v112
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v113
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v101
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX12-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v100.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v81
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v132
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v69
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v71.l, v48.l, s9
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s5, 0, v68
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s43
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s12, 0, v51.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s13, 0, v52.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s15, 0, v54.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v81.l, v50.l, s11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v115
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.h, v84.l, v53.l, s14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v117
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v114
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v100
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v103
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s11, v98, v113
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s14, v87, v101
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v80.l, v49.l, s10
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v80.l, v48.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.h, v32.l, v11.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v14.l, v30.l, s41
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s44
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.h, v82.l, v51.l, s12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.h, v83.l, v52.l, s13
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.h, v85.l, v54.l, s15
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s12, v99, v114
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s13, v86, v100
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s15, v96, v103
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v70.l, v39.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v81.l, v50.l, s11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v53.l, v84.l, v53.l, s14
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v49.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v80
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.h, v33.l, v12.h, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v30.l, v14.l, s42
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v13.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v115.l, v29.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v82.l, v51.l, s12
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v52.l, v83.l, v52.l, s13
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v54.l, v85.l, v54.l, s15
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v39.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v85.l, v53.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v81
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s9, 0, v80
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.h, v36.l, v30.h, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s45
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v118.l, v14.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v97.l, v30.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v119
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v115
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v70.l, v38.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v50.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v51.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v84.l, v52.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v71
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s10, 0, v81
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v35.l, v29.h, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s46
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v118
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v97
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s17, v116, v115
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v70
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v82
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v83
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v84
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v71
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s14, 0, v85
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v34.l, v28.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.h, v37.l, v32.h, s6
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v49.l, v37.h, s10
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s16, v112, v97
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v70
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s11, 0, v82
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s12, 0, v83
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s13, 0, v84
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.h, v39.l, v35.h, s8
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v28.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v55.l, v30.l, v14.l, s16
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v38.l, v33.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v50.l, v38.h, s11
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v54.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v55.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v87
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s15, 0, v86
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v54.l, v50.h, s15
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff0000, v31
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v31
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v50.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v15.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v31
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v51.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v54, v54
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v64.l, v15.h, v31.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v66, v66
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v15.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v64.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v50.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v52, v53
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v15.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v52
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v51.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v32.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v50.l, s0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v38, v53
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v14.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v29
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v50
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v33.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v117, v117
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v31.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v52, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v52
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v13.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v29.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v28
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v52, v51
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v65.l, v31.h, v64.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v66
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.h, v48.l, v36.h, s9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v31.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v65.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v69
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v68
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v67, v69
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v66, v68
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v29.l, v13.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v31.l, v15.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v65.l, v64.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v15.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v64.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v33.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v32.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v31.l, v15.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v65.l, v64.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v29.l, v13.l, s17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v35
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.h, v51.l, v39.h, s12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v31.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v36
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v37
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.h, v53.l, v49.h, s14
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v27
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.h, v52.l, v48.h, s13
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v102, v102
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.h, v32.l, v15.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v14.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v49, v49
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v30.l, v14.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v13.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v48, v39
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v49
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v29.l, v13.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v28.l, v12.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v27.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v26
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v29.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v55.l, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v49, v48
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.l, v26.l, v10.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v9
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v27.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v31.l, v10.h, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v26.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v12.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v30.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v28.l, v12.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v11.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v49, v48
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.l, v25.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v8
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v27.l, v11.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.l, v26.l, v10.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v25.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v9.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v27.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v29.l, v9.h, s0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v30.l, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v49, v48
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v24.l, v24.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v8.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v25.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v24.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v28.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v11
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v15.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v13.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v30.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v51, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v29.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.h, v11.h, v29.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v27.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v11.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v28.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v31.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v51, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.h, v10.h, v28.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v10.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v26.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v51, v50
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v11.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v9.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v8
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v24
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v25.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v27, v26
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v9.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.h, v8.h, v12.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v8.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v12.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v10.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v11.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v26.l, v10.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v9.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v49, v48
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v50
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v9
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v23.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v23.l, v7.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v24
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v25.l, v9.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v24.l, v8.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v11.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v9.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v6.l, v22.l, s1
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v23, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v39
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v26
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v25, v24
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v28.l, v28.l, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v27.l, v7.h, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v23
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v22.l, v22.l, v6.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v21
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v10.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v12, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v9.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v23.l, v10.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v22.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v22
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v5.h, v9.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.h, v24.l, v8.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v21.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v39
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v8.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v10.l, v7.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v27, v26
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v22.l, v6.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v9.l, v5.h, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v30.l, v23.l, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v20.l, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v7.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v6.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v8.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v8.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v10.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v5.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v20
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v12, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v22.l, v6.l, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v19.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.h, v8.l, v5.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v31.l, v7.l, v3.h, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v23, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v32.l, v10.l, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v18
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v8, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v7.l, v4.h, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v11, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, v6.l, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v18.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v12, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v3.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v17.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v20
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v10.l, v16.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v9.l, v4.l, s3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v10.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v5.l
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v17, v16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v2.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v16.l, v6.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v16
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v11
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v17.l, v7.l, v2.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v20, v18
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v22, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v16.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v19
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v17.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v18.l, v8.l, v1.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v9.l, v2.h, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v19.l, v10.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v18.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v3.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v21
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s7, 0, v20
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s8, 0, v21
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v10.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v1.l, s5
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v2.l, s6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v6.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v9.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v38.l, v19.l, v0.l, s8
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v37.l, v18.l, v0.h, s7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v36.l, v17.l, v1.l, s3
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v35.l, v16.l, v1.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v33.l, v5.l, v2.l, s1
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v29 :: v_dual_mov_b32 v1, v49
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v48 :: v_dual_mov_b32 v3, v39
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, v38 :: v_dual_mov_b32 v1, v37
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v35
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v38 :: v_dual_mov_b32 v5, v36
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, v35 :: v_dual_mov_b32 v7, v34
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, v33 :: v_dual_mov_b32 v9, v32
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v10, v31 :: v_dual_mov_b32 v11, v30
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v33 :: v_dual_mov_b32 v5, v32
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, v31 :: v_dual_mov_b32 v7, v30
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, v29 :: v_dual_mov_b32 v9, v28
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v10, v34
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v32bf16:
@@ -13765,792 +12309,683 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v30
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v30
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v13
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v20
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v19
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v35, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v29
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v118, 0xffff0000, v3
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v80, v71, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v130, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v38, v38
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v84, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v116, v115, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff0000, v12
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v28
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v18
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s1, v50, v50
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v48, v48, v39, s0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v134, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v96, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v128, v119, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v26
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v1
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v146, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v52, v52, v51, s1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v100, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v132, v131, vcc_lo
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v8
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v25
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v9
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v25
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v8
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v16
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s2, v54, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v112, v103, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v114, v114
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v116, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v118, v118
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
-; GFX12-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v128, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v130, v130
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v100, v132, v131, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s5, v82, v82
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v102, v144, v135 :: v_dual_and_b32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v144, v135, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v146, v146
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v96
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v70, v70
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v13
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v84, v84, v83, s5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v147, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v27
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v27
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s6, v86, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v54, v14, v30 :: v_dual_and_b32 v97, 0xffff0000, v23
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v11
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v64, v64, v55, s2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v23
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v80, v80, v71, s4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v22
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v96, v96, v87, s6
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v117, 0xffff0000, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 16, v100
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v101, 0xffff0000, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v39
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v80
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v55, v64 :: v_dual_lshlrev_b32 v130, 16, v51
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v55, v64, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 16, v34
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 16, v51
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v84
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v54
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s2, 0, v96
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s3, v66, v66
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s5, 0, v114
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s6, 0, v116
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v68, v68, v67, s3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v68
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v80
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v67
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v69, v71, v80, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v83, v80, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v71, v83, v84, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v97, v97
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v84
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v87, v87, v82 :: v_dual_lshlrev_b32 v134, 16, v83
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v83, v87, v96 :: v_dual_and_b32 v98, 0xffff0000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s7, v98, v98
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v101, v101
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v16
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v28
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v100, v100, v99, s7
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s7, 0, v118
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v99, v99, v84, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v99, v100, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v113, v113
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s3, 0, v100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s8, v102, v102
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v112, v112, v103, s8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v103, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v99
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v97, v103, v112, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v117, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 s4, 0, v112
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v115, v96, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v101, v115, v114, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v129, v129
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v82
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v116
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v115, v119, v98 :: v_dual_lshlrev_b32 v146, 16, v113
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v113, v119, v116, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v133, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v100, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v117, v131, v118, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v145, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 16, v86
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v119, v135, v102, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v129, v135, v128, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 16, v118
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v38, v147, v34 :: v_dual_lshlrev_b32 v49, 16, v52
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v147, v34, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v49, v130
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v30, v54 :: v_dual_lshlrev_b32 v53, 16, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v30
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v70
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v117
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v130, v35, v36 :: v_dual_lshlrev_b32 v129, 16, v39
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v37, v129
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v129, v51, v52, s0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v39, v48 :: v_dual_lshlrev_b32 v118, 16, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v55
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v53, v131
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v55, v64 :: v_dual_lshlrev_b32 v50, 16, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 16, v71
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v132
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v67, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v69, v133
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 16, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v69, v71, v70 :: v_dual_lshlrev_b32 v132, 16, v65
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v81, v134
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v83, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v85, v135
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v145, 16, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v85, v87, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v97, v144
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v97, v99, v84 :: v_dual_lshlrev_b32 v114, 16, v98
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v101, v145
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 16, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v101, v103, v86 :: v_dual_lshlrev_b32 v144, 16, v97
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v112, v146
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v30, v14 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v48
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v113, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v114, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v119
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v38
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v84
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v114, v115, v98, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v116, v14
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v36
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v116, v117, v100, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v118, v30
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v35, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v48
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v118, v119, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v128, v49
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v133, v39, v48, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v128, v38, v34, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v135, v51, v52 :: v_dual_lshlrev_b32 v50, 16, v15
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v64
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v145, v54, v64, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v37, v48, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v147, v65, v68, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v37, v132
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v132, v69, v80, s0
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v49, v134
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v65
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v134, v71, v84, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v129, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e64 vcc_lo, 0, v128
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v114
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v53, v144
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v144, v83, v96, s2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v55, v146
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v39, v39, v48, s0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v129, v128, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v34
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v69
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v146, v86, v100, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v80
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s3, v67, v37
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v54, v54, v64, s2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v38, v34 :: v_dual_lshlrev_b32 v53, 16, v71
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v86
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v67, v97, v112, s4
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s4, v70, v49
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v70, v101, v114, s5
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s5, v81, v53
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v65, v65, v68, s3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v100
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v97
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v71, v71, v84, s5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v112
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v84, v30, v14 :: v_dual_lshlrev_b32 v85, 16, v96
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v87, v37
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v83
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v101
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v81, v113, v116, s6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v86, v100 :: v_dual_lshlrev_b32 v36, 16, v117
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v99, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e64 s6, v85, v55
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v113
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v51, v51, v52, s1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v97, v112, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v103, v53
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 16, v128
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v129
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v38
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v83, v83, v96, s6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v101, v114, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v115, v55
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v14
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 16, v34
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v69, v69, v80, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v113, v116 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v119, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v69
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v117, v118 :: v_dual_lshlrev_b32 v87, 16, v71
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v130, v52
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 16, v37
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v49
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v100, 16, v53
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v52, v129, v128 :: v_dual_lshlrev_b32 v101, 16, v55
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v131, v68
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v103, 16, v36
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v85, v117, v118, s7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v52
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v80
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v35
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v39
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v54
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 16, v34
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v82, v96
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v65
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v83
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v14
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v35, v35, v98 :: v_dual_lshlrev_b32 v68, 16, v51
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v66
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v39, v133, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v53, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v65, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v69, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v81, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v85, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v97, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v86
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v101, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v96
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v96, v112, v96, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v98
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v98, v114, v98 :: v_dual_lshlrev_b32 v131, 16, v53
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v100, v116, v100 :: v_dual_lshlrev_b32 v133, 16, v69
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v14, v35 :: v_dual_lshlrev_b32 v135, 16, v85
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v102
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v118, v102, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v36, v36, v39 :: v_dual_lshlrev_b32 v145, 16, v101
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v128, v34, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v49, v51, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v128
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v64, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v68, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v64, v70, v71 :: v_dual_lshlrev_b32 v147, 16, v114
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v80, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v82, v87, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v84, v99, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v86, v103 :: v_dual_lshlrev_b32 v30, 16, v130
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v37
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v96, v113, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v115
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v115, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v117
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 16, v81
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v83, v100, v117, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v119
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v35, v119, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v135, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v80
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v54, v145, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v82
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v130, v14, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v65, v147, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v86
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v37, v36, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v69, v132, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v87
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v129, v39, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v131
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v66, v71, v134 :: v_dual_and_b32 v69, 0xffff0000, v31
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v96
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v53, v49, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v132
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v31
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v83, v144, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v97
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v133
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v146, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v99
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v69, v64, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v134
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v67, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v81, v67, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v135
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v53, v70, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v101
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v85, v68, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v144
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v97, v70, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v145
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v101, v71, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v55, v81, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v15, v31, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v148, 16, v116
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v146, 16, v112
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v146
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v112, v80, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v33, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 16, v118
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v31, v55 :: v_dual_lshlrev_b32 v64, 16, v52
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v147
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v114, v82, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v148
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v116, v83, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v52, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v67
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v53, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v102
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v65
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v118, v84, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v33, v67, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v103
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v65, v55, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v52, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v55, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v67
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v13
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v64, v33, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v128, v86, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v68
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v36, v85, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v65, v53, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v66
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v29 :: v_dual_lshlrev_b32 v64, 16, v54
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v64, v53
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v67, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v112
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v32
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v36
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v66, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v13
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v52, v48, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v31
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v113
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v29, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v64, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v53, v54, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v67, v69
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v36, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v15
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v66, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v65, v64
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v53
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v32
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v29, v13 :: v_dual_lshlrev_b32 v66, 16, v12
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v36, v32 :: v_dual_lshlrev_b32 v31, 16, v50
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v84, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v31
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v53, v54 :: v_dual_lshlrev_b32 v64, 16, v55
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v66, v65
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v14, v53, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v28, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
+; GFX12-FAKE16-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v11
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v50, v15, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v52, v32, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v30
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v31, v15, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v13
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v11 :: v_dual_lshlrev_b32 v28, 16, v54
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v64
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v29, v28, v12 :: v_dual_lshlrev_b32 v50, 16, v11
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v30, v13, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v30, v13 :: v_dual_lshlrev_b32 v32, 16, v27
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX12-FAKE16-NEXT:    v_perm_b32 v13, v38, v13, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v10, v26 :: v_dual_lshlrev_b32 v29, 16, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v54, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v50, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v27, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v29, v12, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v39, v12, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v26
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v36, v30
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v10
-; GFX12-FAKE16-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v26, v10 :: v_dual_lshlrev_b32 v30, 16, v8
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v26, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v28, v11, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v11, v51, v11, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v55, v54
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v29, 16, v25
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v25, v9, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v32, v29
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v25, v9 :: v_dual_lshlrev_b32 v29, 16, v7
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v24
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v10, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v27, v10, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
+; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v54, v10, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v26, 16, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX12-FAKE16-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v30, v28
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v25, v24, v8 :: v_dual_lshlrev_b32 v28, 16, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v23, v7 :: v_dual_lshlrev_b32 v26, 16, v24
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v26, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v65, v9, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v27, v9 :: v_dual_lshlrev_b32 v28, 16, v23
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
-; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v27, 16, v23
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v26
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v24, v8 :: v_dual_lshlrev_b32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v29, v28
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v23, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v24, 16, v26
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v25, v8 :: v_dual_lshlrev_b32 v27, 16, v22
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v66, v8, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v22, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v26, v8, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v28, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v22, v6 :: v_dual_lshlrev_b32 v27, 16, v20
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v21
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
-; GFX12-FAKE16-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v27, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v68, v7, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v21 :: v_dual_lshlrev_b32 v24, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v20 :: v_dual_lshlrev_b32 v25, 16, v5
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v21, v5 :: v_dual_lshlrev_b32 v25, 16, v19
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v4 :: v_dual_lshlrev_b32 v23, 16, v26
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v26, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v21, v5 :: v_dual_lshlrev_b32 v22, 16, v19
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v37, v6, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v20, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v22, v4 :: v_dual_lshlrev_b32 v21, 16, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v19, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v23, v5, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v21
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v18 :: v_dual_lshlrev_b32 v23, 16, v24
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v24, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v0
-; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v31, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v17
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v16
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v1 :: v_dual_lshlrev_b32 v20, 16, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v18
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v17
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v23, v19
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v18, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v24, v20
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v0 :: v_dual_lshlrev_b32 v19, 16, v17
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v22
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v17, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v18, v2 :: v_dual_lshlrev_b32 v25, 16, v16
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v26, v19
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v2 :: v_dual_lshlrev_b32 v23, 16, v16
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v25, v23
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v16, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v55, v3, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v20, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_lshlrev_b32 v23, 16, v24
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v25
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v20, v1 :: v_dual_lshlrev_b32 v16, 16, v19
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v18
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v50, v1, 0x5040100
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v23, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v52, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v48, v1, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v20, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v34, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v23
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v32, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v33, v2, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v22, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v15, v4, 0x5040100
-; GFX12-FAKE16-NEXT:    v_perm_b32 v15, v33, v51, 0x5040100
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v24, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v53, v4, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <32 x bfloat> @llvm.minimumnum.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y)
   ret <32 x bfloat> %result
@@ -14580,14 +13015,11 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14596,22 +13028,19 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14619,27 +13048,24 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14655,11 +13081,9 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -14684,16 +13108,13 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14714,11 +13135,9 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -14750,19 +13169,15 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14790,13 +13205,10 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@@ -14829,23 +13241,20 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14855,12 +13264,10 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -14872,23 +13279,20 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -14898,12 +13302,10 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
@@ -14915,27 +13317,24 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX950-NEXT:    s_nop 0
@@ -14948,15 +13347,11 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v2, v0, s0
@@ -14985,18 +13380,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
@@ -15028,36 +13419,31 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15071,39 +13457,36 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15139,34 +13522,30 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v6
 ; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v5, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0.l
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
@@ -15188,46 +13567,41 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v0 :: v_dual_lshlrev_b32 v4, 16, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v3, v2 :: v_dual_lshlrev_b32 v7, 16, v1
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_lshlrev_b32 v7, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v3, v2 :: v_dual_lshlrev_b32 v3, 16, v4
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -15267,23 +13641,20 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15293,12 +13664,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15310,12 +13679,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -15327,23 +13694,20 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15353,12 +13717,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15370,12 +13732,10 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
@@ -15387,27 +13747,24 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -15420,14 +13777,11 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
@@ -15443,15 +13797,11 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v4, v0, s0
@@ -15488,29 +13838,23 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -15520,65 +13864,65 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v9, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -15587,59 +13931,53 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -15651,76 +13989,70 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v8, v8
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v2.h, v4.l, s0
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v8, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v7, v11
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v8.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v9
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v0.h, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v7.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -15733,75 +14065,67 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v4 :: v_dual_lshlrev_b32 v6, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v9, 16, v3
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v10
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v10, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v5, v4 :: v_dual_lshlrev_b32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v7
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v3, v1 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v1 :: v_dual_lshlrev_b32 v2, 16, v6
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
   ret <3 x bfloat> %result
@@ -15843,42 +14167,37 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX8-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX8-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15888,12 +14207,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15905,12 +14222,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -15924,42 +14239,37 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX900-NEXT:    s_movk_i32 s4, 0x8000
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -15969,12 +14279,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
@@ -15986,12 +14294,10 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX900-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s4
@@ -16004,51 +14310,46 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
 ; GFX950-NEXT:    v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT:    s_movk_i32 s0, 0x8000
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v7
-; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX950-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v8
-; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v7, v6, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v9
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX950-NEXT:    s_nop 0
@@ -16061,20 +14362,17 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v7
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX950-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
@@ -16084,15 +14382,11 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX950-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v3
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX950-NEXT:    v_cmp_eq_u16_e32 vcc, s0, v2
-; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-NEXT:    v_perm_b32 v1, v4, v1, s0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX950-NEXT:    v_perm_b32 v0, v5, v0, s0
@@ -16121,57 +14415,49 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v14
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -16182,84 +14468,76 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v11
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v9, v14
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v13, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
@@ -16285,63 +14563,54 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v5, v4 :: v_dual_and_b32 v9, 0xffff0000, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16355,98 +14624,86 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s4, v10, v10
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s3, v9, v9
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v4.l
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, s2
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s2, 0x8000, v7.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v10, v11
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v9, v14
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v13, v12
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v4.l, v7.l, v6.l, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v9.l, v2.l, v0.l, s2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v9.l
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s1, 0x8000, v3.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v7
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.h, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s3, 0, v7
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s4, 0, v10
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e64 s1, 0, v10
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s1
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v8.l, v0.h, s0
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v9.l, v0.l, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v1.l, s3
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
@@ -16479,78 +14736,69 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v13, 16, v3
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v6 :: v_dual_lshlrev_b32 v14, 16, v0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v4, v5 :: v_dual_lshlrev_b32 v9, 16, v7
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v15, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v10, v4 :: v_dual_lshlrev_b32 v5, 16, v6
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v9
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v8
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v10
-; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v8
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v7
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_lshlrev_b32 v6, 16, v7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v6 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v11
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
diff --git a/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll
index 7aaf00f871136..a4a57e64c48cb 100644
--- a/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll
+++ b/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll
@@ -17,18 +17,16 @@ define float @maximumnum_float(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_float:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.s $f0, $f13
 ; MIPS64R2-NEXT:    c.un.s $f12, $f12
 ; MIPS64R2-NEXT:    movt.s $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.s $f13, $f13
-; MIPS64R2-NEXT:    movt.s $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.ule.s $f12, $f13
-; MIPS64R2-NEXT:    mov.s $f0, $f13
-; MIPS64R2-NEXT:    movf.s $f0, $f12, $fcc0
+; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
 ; MIPS64R2-NEXT:    mov.s $f1, $f0
 ; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
+; MIPS64R2-NEXT:    c.ule.s $f12, $f0
+; MIPS64R2-NEXT:    movf.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
@@ -67,14 +65,12 @@ define float @maximumnum_float_nnan(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_float_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.ule.s $f12, $f13
 ; MIPS64R2-NEXT:    mov.s $f0, $f13
-; MIPS64R2-NEXT:    movf.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
-; MIPS64R2-NEXT:    mov.s $f1, $f0
+; MIPS64R2-NEXT:    mov.s $f1, $f13
 ; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
+; MIPS64R2-NEXT:    c.ule.s $f12, $f13
+; MIPS64R2-NEXT:    movf.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
@@ -94,18 +90,16 @@ define double @maximumnum_double(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_double:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.d $f0, $f13
 ; MIPS64R2-NEXT:    c.un.d $f12, $f12
 ; MIPS64R2-NEXT:    movt.d $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.d $f13, $f13
-; MIPS64R2-NEXT:    movt.d $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.ule.d $f12, $f13
-; MIPS64R2-NEXT:    mov.d $f0, $f13
-; MIPS64R2-NEXT:    movf.d $f0, $f12, $fcc0
+; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmfc1 $1, $f12
 ; MIPS64R2-NEXT:    mov.d $f1, $f0
 ; MIPS64R2-NEXT:    movz.d $f1, $f12, $1
-; MIPS64R2-NEXT:    dmfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
+; MIPS64R2-NEXT:    c.ule.d $f12, $f0
+; MIPS64R2-NEXT:    movf.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
@@ -144,14 +138,12 @@ define double @maximumnum_double_nnan(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: maximumnum_double_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.ule.d $f12, $f13
 ; MIPS64R2-NEXT:    mov.d $f0, $f13
-; MIPS64R2-NEXT:    movf.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmfc1 $1, $f12
-; MIPS64R2-NEXT:    mov.d $f1, $f0
+; MIPS64R2-NEXT:    mov.d $f1, $f13
 ; MIPS64R2-NEXT:    movz.d $f1, $f12, $1
-; MIPS64R2-NEXT:    dmfc1 $1, $f13
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
+; MIPS64R2-NEXT:    c.ule.d $f12, $f13
+; MIPS64R2-NEXT:    movf.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
@@ -170,21 +162,16 @@ define float @minimumnum_float(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_float:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.s $f0, $f13
 ; MIPS64R2-NEXT:    c.un.s $f12, $f12
 ; MIPS64R2-NEXT:    movt.s $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.s $f13, $f13
-; MIPS64R2-NEXT:    movt.s $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.olt.s $f12, $f13
-; MIPS64R2-NEXT:    mov.s $f0, $f13
 ; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
-; MIPS64R2-NEXT:    lui $2, 32768
-; MIPS64R2-NEXT:    xor $1, $1, $2
 ; MIPS64R2-NEXT:    mov.s $f1, $f0
-; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    xor $1, $1, $2
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
+; MIPS64R2-NEXT:    movn.s $f1, $f12, $1
+; MIPS64R2-NEXT:    c.olt.s $f12, $f0
+; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
@@ -223,17 +210,12 @@ define float @minimumnum_float_nnan(float %x, float %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_float_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.olt.s $f12, $f13
 ; MIPS64R2-NEXT:    mov.s $f0, $f13
-; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mfc1 $1, $f12
-; MIPS64R2-NEXT:    lui $2, 32768
-; MIPS64R2-NEXT:    xor $1, $1, $2
-; MIPS64R2-NEXT:    mov.s $f1, $f0
-; MIPS64R2-NEXT:    movz.s $f1, $f12, $1
-; MIPS64R2-NEXT:    mfc1 $1, $f13
-; MIPS64R2-NEXT:    xor $1, $1, $2
-; MIPS64R2-NEXT:    movz.s $f1, $f13, $1
+; MIPS64R2-NEXT:    mov.s $f1, $f13
+; MIPS64R2-NEXT:    movn.s $f1, $f12, $1
+; MIPS64R2-NEXT:    c.olt.s $f12, $f13
+; MIPS64R2-NEXT:    movt.s $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    mtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.s $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
@@ -252,22 +234,16 @@ define double @minimumnum_double(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_double:
 ; MIPS64R2:       # %bb.0:
+; MIPS64R2-NEXT:    mov.d $f0, $f13
 ; MIPS64R2-NEXT:    c.un.d $f12, $f12
 ; MIPS64R2-NEXT:    movt.d $f12, $f13, $fcc0
 ; MIPS64R2-NEXT:    c.un.d $f13, $f13
-; MIPS64R2-NEXT:    movt.d $f13, $f12, $fcc0
-; MIPS64R2-NEXT:    c.olt.d $f12, $f13
-; MIPS64R2-NEXT:    mov.d $f0, $f13
 ; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmfc1 $1, $f12
-; MIPS64R2-NEXT:    daddiu $2, $zero, 1
-; MIPS64R2-NEXT:    dsll $2, $2, 63
-; MIPS64R2-NEXT:    xor $1, $1, $2
 ; MIPS64R2-NEXT:    mov.d $f1, $f0
-; MIPS64R2-NEXT:    movz.d $f1, $f12, $1
-; MIPS64R2-NEXT:    dmfc1 $1, $f13
-; MIPS64R2-NEXT:    xor $1, $1, $2
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
+; MIPS64R2-NEXT:    movn.d $f1, $f12, $1
+; MIPS64R2-NEXT:    c.olt.d $f12, $f0
+; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
@@ -306,18 +282,12 @@ define double @minimumnum_double_nnan(double %x, double %y) {
 ;
 ; MIPS64R2-LABEL: minimumnum_double_nnan:
 ; MIPS64R2:       # %bb.0:
-; MIPS64R2-NEXT:    c.olt.d $f12, $f13
 ; MIPS64R2-NEXT:    mov.d $f0, $f13
+; MIPS64R2-NEXT:    dmfc1 $1, $f12
+; MIPS64R2-NEXT:    mov.d $f1, $f13
+; MIPS64R2-NEXT:    movn.d $f1, $f12, $1
+; MIPS64R2-NEXT:    c.olt.d $f12, $f13
 ; MIPS64R2-NEXT:    movt.d $f0, $f12, $fcc0
-; MIPS64R2-NEXT:    daddiu $1, $zero, 1
-; MIPS64R2-NEXT:    dsll $1, $1, 63
-; MIPS64R2-NEXT:    dmfc1 $2, $f12
-; MIPS64R2-NEXT:    xor $2, $2, $1
-; MIPS64R2-NEXT:    mov.d $f1, $f0
-; MIPS64R2-NEXT:    movz.d $f1, $f12, $2
-; MIPS64R2-NEXT:    dmfc1 $2, $f13
-; MIPS64R2-NEXT:    xor $1, $2, $1
-; MIPS64R2-NEXT:    movz.d $f1, $f13, $1
 ; MIPS64R2-NEXT:    dmtc1 $zero, $f2
 ; MIPS64R2-NEXT:    c.eq.d $f0, $f2
 ; MIPS64R2-NEXT:    jr $ra
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 33bc93d0fe4db..dfb339e2f5f8f 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1813,14 +1813,14 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-LABEL: test_fmaximumnum_v4f16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    subq $56, %rsp
-; AVX512-NEXT:    vmovdqa %xmm1, %xmm5
-; AVX512-NEXT:    vmovdqa %xmm0, %xmm6
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm6
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm5
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vucomiss %xmm0, %xmm0
 ; AVX512-NEXT:    setp %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512-NEXT:    vucomiss %xmm1, %xmm1
 ; AVX512-NEXT:    setp %al
@@ -1829,245 +1829,255 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
 ; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm11
+; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm0
 ; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vucomiss %xmm0, %xmm1
 ; AVX512-NEXT:    seta %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vucomiss %xmm0, %xmm0
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
+; AVX512-NEXT:    setp %cl
+; AVX512-NEXT:    kmovw %ecx, %k1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512-NEXT:    vucomiss %xmm1, %xmm1
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    setp %cl
+; AVX512-NEXT:    kmovw %ecx, %k2
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k2}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
 ; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm13
+; AVX512-NEXT:    vcvtph2ps %xmm13, %xmm0
 ; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vucomiss %xmm0, %xmm1
-; AVX512-NEXT:    seta %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT:    seta %cl
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %dl
+; AVX512-NEXT:    kmovw %edx, %k1
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512-NEXT:    vucomiss %xmm1, %xmm1
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT:    vucomiss %xmm2, %xmm2
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k2
-; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    setp %dl
+; AVX512-NEXT:    kmovw %edx, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k2}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
 ; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT:    vucomiss %xmm1, %xmm2
-; AVX512-NEXT:    seta %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm3
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm5[1,0]
-; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT:    vucomiss %xmm1, %xmm1
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm6[1,0]
-; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT:    vucomiss %xmm2, %xmm2
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k2
-; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm13
-; AVX512-NEXT:    vcvtph2ps %xmm13, %xmm2
-; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm12
-; AVX512-NEXT:    vcvtph2ps %xmm12, %xmm1
-; AVX512-NEXT:    vucomiss %xmm1, %xmm2
-; AVX512-NEXT:    seta %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm14
+; AVX512-NEXT:    vcvtph2ps %xmm14, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vucomiss %xmm0, %xmm1
+; AVX512-NEXT:    seta %dl
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm15
+; AVX512-NEXT:    vucomiss %xmm15, %xmm15
+; AVX512-NEXT:    setp %sil
+; AVX512-NEXT:    kmovw %esi, %k1
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm5[1,0]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %sil
+; AVX512-NEXT:    kmovw %esi, %k2
+; AVX512-NEXT:    vmovss %xmm15, %xmm0, %xmm0 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm12
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512-NEXT:    vcvtph2ps %xmm12, %xmm3
+; AVX512-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovss %xmm3, %xmm15, %xmm15 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm15, %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm13
+; AVX512-NEXT:    vucomiss %xmm13, %xmm3
+; AVX512-NEXT:    seta %sil
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpsrlq $48, %xmm5, %xmm0
+; AVX512-NEXT:    vpsrlq $48, %xmm6, %xmm0
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vucomiss %xmm0, %xmm0
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vpsrlq $48, %xmm6, %xmm1
+; AVX512-NEXT:    setp %dil
+; AVX512-NEXT:    kmovw %edi, %k1
+; AVX512-NEXT:    vpsrlq $48, %xmm5, %xmm1
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512-NEXT:    vucomiss %xmm1, %xmm1
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    setp %dil
+; AVX512-NEXT:    kmovw %edi, %k2
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm11
-; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm1
-; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm15
-; AVX512-NEXT:    vcvtph2ps %xmm15, %xmm7
-; AVX512-NEXT:    vucomiss %xmm7, %xmm1
-; AVX512-NEXT:    seta %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm1, %xmm7, %xmm7 {%k1}
-; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT:    vucomiss %xmm0, %xmm0
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm10
+; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm14
+; AVX512-NEXT:    vmovss %xmm14, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm15
+; AVX512-NEXT:    vucomiss %xmm15, %xmm14
+; AVX512-NEXT:    seta %dil
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm6[1,1,3,3]
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512-NEXT:    vucomiss %xmm1, %xmm1
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k2
-; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm9
-; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm4
+; AVX512-NEXT:    setp %r8b
+; AVX512-NEXT:    kmovw %r8d, %k1
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %r8b
+; AVX512-NEXT:    kmovw %r8d, %k2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm7
+; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm8
+; AVX512-NEXT:    vmovss %xmm8, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm8
+; AVX512-NEXT:    seta %r8b
+; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %r9b
+; AVX512-NEXT:    kmovw %r9d, %k1
+; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %r9b
+; AVX512-NEXT:    kmovw %r9d, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm4
 ; AVX512-NEXT:    vmovss %xmm4, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm10
-; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm3
-; AVX512-NEXT:    vucomiss %xmm3, %xmm4
-; AVX512-NEXT:    seta %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm4, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm0
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm9
+; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm1
+; AVX512-NEXT:    vucomiss %xmm1, %xmm4
+; AVX512-NEXT:    seta %r9b
+; AVX512-NEXT:    vpsrld $16, %xmm6, %xmm6
+; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm0
 ; AVX512-NEXT:    vucomiss %xmm0, %xmm0
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm4
-; AVX512-NEXT:    vucomiss %xmm4, %xmm4
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k2
-; AVX512-NEXT:    vmovss %xmm0, %xmm4, %xmm4 {%k2}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm1
-; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm8
-; AVX512-NEXT:    vcvtph2ps %xmm8, %xmm2
-; AVX512-NEXT:    vucomiss %xmm2, %xmm1
-; AVX512-NEXT:    seta %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT:    vpsrld $16, %xmm5, %xmm1
-; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT:    vucomiss %xmm1, %xmm1
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vpsrld $16, %xmm6, %xmm5
+; AVX512-NEXT:    setp %r10b
+; AVX512-NEXT:    kmovw %r10d, %k1
+; AVX512-NEXT:    vpsrld $16, %xmm5, %xmm5
 ; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
 ; AVX512-NEXT:    vucomiss %xmm5, %xmm5
-; AVX512-NEXT:    setp %al
-; AVX512-NEXT:    kmovw %eax, %k2
-; AVX512-NEXT:    vmovss %xmm1, %xmm5, %xmm5 {%k2}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm5, %xmm6
-; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm5
-; AVX512-NEXT:    vmovss %xmm5, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    setp %r10b
+; AVX512-NEXT:    kmovw %r10d, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm5, %xmm5 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
+; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm6
+; AVX512-NEXT:    vmovss %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
+; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm9 = xmm9[0],mem[0]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
+; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm11[0]
+; AVX512-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT:    vpcmpeqw %xmm5, %xmm2, %xmm5
+; AVX512-NEXT:    vpblendvb %xmm5, %xmm2, %xmm9, %xmm7
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vmovss %xmm5, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    kmovw %ecx, %k1
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vmovaps (%rsp), %xmm9 # 16-byte Reload
+; AVX512-NEXT:    vmovss %xmm9, %xmm5, %xmm5 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm10
+; AVX512-NEXT:    vcvtps2ph $4, %xmm5, %xmm11
+; AVX512-NEXT:    kmovw %edx, %k1
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vmovss %xmm5, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm9
+; AVX512-NEXT:    kmovw %esi, %k1
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT:    vmovss %xmm2, %xmm13, %xmm13 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm13, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1]
+; AVX512-NEXT:    kmovw %edi, %k1
+; AVX512-NEXT:    vmovss %xmm14, %xmm15, %xmm15 {%k1}
+; AVX512-NEXT:    kmovw %r8d, %k1
+; AVX512-NEXT:    vmovss %xmm8, %xmm3, %xmm3 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm15, %xmm8
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT:    kmovw %r9d, %k1
+; AVX512-NEXT:    vmovss %xmm4, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm0
-; AVX512-NEXT:    vucomiss %xmm0, %xmm5
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm6
 ; AVX512-NEXT:    seta %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcvtps2ph $4, %xmm7, %xmm7
-; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm5
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3]
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm13[0]
-; AVX512-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512-NEXT:    vpcmpeqw %xmm6, %xmm4, %xmm9
-; AVX512-NEXT:    vpblendvb %xmm9, %xmm4, %xmm0, %xmm4
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3]
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm11 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0]
-; AVX512-NEXT:    vpcmpeqw %xmm6, %xmm1, %xmm6
-; AVX512-NEXT:    vpblendvb %xmm6, %xmm1, %xmm4, %xmm1
-; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vmovss %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm4
 ; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
+; AVX512-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT:    vucomiss %xmm5, %xmm4
 ; AVX512-NEXT:    movl $65535, %ecx # imm = 0xFFFF
-; AVX512-NEXT:    movl $0, %edx
-; AVX512-NEXT:    cmovel %ecx, %edx
-; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm2
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
-; AVX512-NEXT:    movl $0, %esi
-; AVX512-NEXT:    cmovel %ecx, %esi
-; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm2
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
-; AVX512-NEXT:    movl $0, %edi
-; AVX512-NEXT:    cmovel %ecx, %edi
-; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm2
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
-; AVX512-NEXT:    movl $0, %r8d
-; AVX512-NEXT:    cmovel %ecx, %r8d
-; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
-; AVX512-NEXT:    movl $0, %r9d
-; AVX512-NEXT:    cmovel %ecx, %r9d
-; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
-; AVX512-NEXT:    movl $0, %r10d
-; AVX512-NEXT:    cmovel %ecx, %r10d
-; AVX512-NEXT:    vcvtph2ps (%rsp), %xmm2 # 16-byte Folded Reload
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
-; AVX512-NEXT:    movl $0, %r11d
-; AVX512-NEXT:    cmovel %ecx, %r11d
-; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX512-NEXT:    vucomiss %xmm4, %xmm2
-; AVX512-NEXT:    vmovd %esi, %xmm2
-; AVX512-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
-; AVX512-NEXT:    vpinsrw $2, %edi, %xmm2, %xmm2
-; AVX512-NEXT:    vpinsrw $3, %r8d, %xmm2, %xmm2
-; AVX512-NEXT:    vpinsrw $4, %r9d, %xmm2, %xmm2
-; AVX512-NEXT:    vpinsrw $5, %r10d, %xmm2, %xmm2
-; AVX512-NEXT:    vpinsrw $6, %r11d, %xmm2, %xmm2
-; AVX512-NEXT:    cmovel %ecx, %eax
-; AVX512-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    cmovnel %eax, %ecx
+; AVX512-NEXT:    cmovpl %eax, %ecx
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    vucomiss %xmm5, %xmm1
+; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
+; AVX512-NEXT:    cmovnel %eax, %edx
+; AVX512-NEXT:    cmovpl %eax, %edx
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm1
+; AVX512-NEXT:    vucomiss %xmm5, %xmm1
+; AVX512-NEXT:    movl $65535, %esi # imm = 0xFFFF
+; AVX512-NEXT:    cmovnel %eax, %esi
+; AVX512-NEXT:    cmovpl %eax, %esi
+; AVX512-NEXT:    vcvtph2ps %xmm8, %xmm1
+; AVX512-NEXT:    vucomiss %xmm5, %xmm1
+; AVX512-NEXT:    movl $65535, %edi # imm = 0xFFFF
+; AVX512-NEXT:    cmovnel %eax, %edi
+; AVX512-NEXT:    cmovpl %eax, %edi
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm1
+; AVX512-NEXT:    vucomiss %xmm5, %xmm1
+; AVX512-NEXT:    movl $65535, %r8d # imm = 0xFFFF
+; AVX512-NEXT:    cmovnel %eax, %r8d
+; AVX512-NEXT:    cmovpl %eax, %r8d
+; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm1
+; AVX512-NEXT:    vucomiss %xmm5, %xmm1
+; AVX512-NEXT:    movl $65535, %r9d # imm = 0xFFFF
+; AVX512-NEXT:    cmovnel %eax, %r9d
+; AVX512-NEXT:    cmovpl %eax, %r9d
+; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm1
+; AVX512-NEXT:    vucomiss %xmm5, %xmm1
+; AVX512-NEXT:    movl $65535, %r10d # imm = 0xFFFF
+; AVX512-NEXT:    cmovnel %eax, %r10d
+; AVX512-NEXT:    cmovpl %eax, %r10d
+; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm1
+; AVX512-NEXT:    vucomiss %xmm5, %xmm1
+; AVX512-NEXT:    vmovd %edx, %xmm1
+; AVX512-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $2, %esi, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
+; AVX512-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; AVX512-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $6, %r10d, %xmm1, %xmm1
+; AVX512-NEXT:    cmovnel %eax, %ecx
+; AVX512-NEXT:    cmovpl %eax, %ecx
+; AVX512-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendvb %xmm1, %xmm7, %xmm0, %xmm0
 ; AVX512-NEXT:    addq $56, %rsp
 ; AVX512-NEXT:    retq
 ;
@@ -2667,3 +2677,186 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
   %r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
   ret <4 x bfloat> %r
 }
+
+define fp128 @maximumnum_fp128(fp128 %x, fp128 %y) nounwind {
+; SSE2-LABEL: maximumnum_fp128:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    subq $72, %rsp
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    callq __unordtf2@PLT
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    jne .LBB35_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB35_2:
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    callq __unordtf2@PLT
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    jne .LBB35_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:  .LBB35_4:
+; SSE2-NEXT:    cmpb $0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT:    je .LBB35_6
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:  .LBB35_6:
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    callq __gttf2@PLT
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    jg .LBB35_8
+; SSE2-NEXT:  # %bb.7:
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:  .LBB35_8:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    callq __eqtf2@PLT
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    je .LBB35_10
+; SSE2-NEXT:  # %bb.9:
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:  .LBB35_10:
+; SSE2-NEXT:    addq $72, %rsp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: maximumnum_fp128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    subq $72, %rsp
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    callq __unordtf2@PLT
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    jne .LBB35_2
+; AVX-NEXT:  # %bb.1:
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB35_2:
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    callq __unordtf2@PLT
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    jne .LBB35_4
+; AVX-NEXT:  # %bb.3:
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:  .LBB35_4:
+; AVX-NEXT:    cmpb $0, {{[0-9]+}}(%rsp)
+; AVX-NEXT:    je .LBB35_6
+; AVX-NEXT:  # %bb.5:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:  .LBB35_6:
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    callq __gttf2@PLT
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    jg .LBB35_8
+; AVX-NEXT:  # %bb.7:
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:  .LBB35_8:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    callq __eqtf2@PLT
+; AVX-NEXT:    testl %eax, %eax
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    je .LBB35_10
+; AVX-NEXT:  # %bb.9:
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:  .LBB35_10:
+; AVX-NEXT:    addq $72, %rsp
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: maximumnum_fp128:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    subq $72, %rsp
+; AVX10_2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    callq __unordtf2@PLT
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    jne .LBB35_2
+; AVX10_2-NEXT:  # %bb.1:
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB35_2:
+; AVX10_2-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    callq __unordtf2@PLT
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT:    jne .LBB35_4
+; AVX10_2-NEXT:  # %bb.3:
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB35_4:
+; AVX10_2-NEXT:    cmpb $0, {{[0-9]+}}(%rsp)
+; AVX10_2-NEXT:    je .LBB35_6
+; AVX10_2-NEXT:  # %bb.5:
+; AVX10_2-NEXT:    vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT:  .LBB35_6:
+; AVX10_2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT:    callq __gttf2@PLT
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    jg .LBB35_8
+; AVX10_2-NEXT:  # %bb.7:
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB35_8:
+; AVX10_2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT:    callq __eqtf2@PLT
+; AVX10_2-NEXT:    testl %eax, %eax
+; AVX10_2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:    je .LBB35_10
+; AVX10_2-NEXT:  # %bb.9:
+; AVX10_2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT:  .LBB35_10:
+; AVX10_2-NEXT:    addq $72, %rsp
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: maximumnum_fp128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    vmovups 12(%ebp), %ymm0
+; X86-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    calll fmaximum_numl
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+  %res = call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y)
+  ret fp128 %res
+}



More information about the llvm-commits mailing list