[llvm] Correctly round FP -> BF16 when SDAG expands such nodes (PR #82399)

David Majnemer via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 20 10:34:30 PST 2024


https://github.com/majnemer created https://github.com/llvm/llvm-project/pull/82399

We did something pretty naive:
- round FP64 -> BF16 by first rounding to FP32
- skip FP32 -> BF16 rounding entirely
- take the top 16 bits of an FP32, which will turn some NaNs into infinities

Let's do this in a more principled way by rounding types with more precision than FP32 to FP32 using round-inexact-to-odd, which avoids double-rounding issues.

>From 4a278ac29d39c44cbe291113f10da2b2a6976d66 Mon Sep 17 00:00:00 2001
From: David Majnemer <david.majnemer at gmail.com>
Date: Thu, 15 Feb 2024 18:15:45 +0000
Subject: [PATCH] Correctly round FP -> BF16 when SDAG expands such nodes

We did something pretty naive:
- round FP64 -> BF16 by first rounding to FP32
- skip FP32 -> BF16 rounding entirely
- take the top 16 bits of an FP32, which will turn some NaNs into infinities

Let's do this in a more principled way by rounding types with more
precision than FP32 to FP32 using round-inexact-to-odd, which
avoids double-rounding issues.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |    94 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |    53 +
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |     3 +
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |    10 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 15370 +++++++++++++---
 .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll |    16 +-
 llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll |   286 +-
 .../isel-amdgpu-cs-chain-preserve-cc.ll       |  1462 +-
 llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll  |   104 +-
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |   293 +-
 llvm/test/CodeGen/NVPTX/bf16-instructions.ll  |     2 +-
 11 files changed, 14119 insertions(+), 3574 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 252b6e9997a710..3426956a41b3d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3219,8 +3219,98 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::FP_ROUND: {
     EVT VT = Node->getValueType(0);
     if (VT.getScalarType() == MVT::bf16) {
-      Results.push_back(
-          DAG.getNode(ISD::FP_TO_BF16, SDLoc(Node), VT, Node->getOperand(0)));
+      if (Node->getConstantOperandVal(1) == 1) {
+        Results.push_back(
+            DAG.getNode(ISD::FP_TO_BF16, SDLoc(Node), VT, Node->getOperand(0)));
+        break;
+      }
+      SDValue Op = Node->getOperand(0);
+      SDValue IsNaN = DAG.getSetCC(dl, getSetCCResultType(Op.getValueType()),
+                                   Op, Op, ISD::SETUO);
+      if (Op.getValueType() != MVT::f32) {
+        // We are rounding binary64/binary128 -> binary32 -> bfloat16. This
+        // can induce double-rounding which may alter the results. We can
+        // correct for this using a trick explained in: Boldo, Sylvie, and
+        // Guillaume Melquiond. "When double rounding is odd." 17th IMACS
+        // World Congress. 2005.
+        FloatSignAsInt ValueAsInt;
+        getSignAsIntValue(ValueAsInt, dl, Op);
+        EVT WideIntVT = ValueAsInt.IntValue.getValueType();
+        SDValue SignMask = DAG.getConstant(ValueAsInt.SignMask, dl, WideIntVT);
+        SDValue SignBit =
+            DAG.getNode(ISD::AND, dl, WideIntVT, ValueAsInt.IntValue, SignMask);
+        SDValue AbsWide;
+        if (TLI.isOperationLegalOrCustom(ISD::FABS, ValueAsInt.FloatVT)) {
+          AbsWide = DAG.getNode(ISD::FABS, dl, ValueAsInt.FloatVT, Op);
+        } else {
+          SDValue ClearSignMask =
+              DAG.getConstant(~ValueAsInt.SignMask, dl, WideIntVT);
+          SDValue ClearedSign = DAG.getNode(ISD::AND, dl, WideIntVT,
+                                            ValueAsInt.IntValue, ClearSignMask);
+          AbsWide = modifySignAsInt(ValueAsInt, dl, ClearedSign);
+        }
+        SDValue AbsNarrow =
+            DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, AbsWide,
+                        DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+        SDValue AbsNarrowAsWide =
+            DAG.getNode(ISD::FP_EXTEND, dl, ValueAsInt.FloatVT, AbsNarrow);
+
+        // We can keep the narrow value as-is if narrowing was exact (no
+        // rounding error), the wide value was NaN (the narrow value is also
+        // NaN and should be preserved) or if we rounded to the odd value.
+        SDValue NarrowBits = DAG.getNode(ISD::BITCAST, dl, MVT::i32, AbsNarrow);
+        SDValue One = DAG.getConstant(1, dl, MVT::i32);
+        SDValue NegativeOne = DAG.getConstant(-1, dl, MVT::i32);
+        SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, NarrowBits, One);
+        EVT I32CCVT = getSetCCResultType(And.getValueType());
+        SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+        SDValue AlreadyOdd = DAG.getSetCC(dl, I32CCVT, And, Zero, ISD::SETNE);
+
+        EVT WideSetCCVT = getSetCCResultType(AbsWide.getValueType());
+        SDValue KeepNarrow = DAG.getSetCC(dl, WideSetCCVT, AbsWide,
+                                          AbsNarrowAsWide, ISD::SETUEQ);
+        KeepNarrow =
+            DAG.getNode(ISD::OR, dl, WideSetCCVT, KeepNarrow, AlreadyOdd);
+        // We morally performed a round-down if `abs_narrow` is smaller than
+        // `abs_wide`.
+        SDValue NarrowIsRd = DAG.getSetCC(dl, WideSetCCVT, AbsWide,
+                                          AbsNarrowAsWide, ISD::SETOGT);
+        // If the narrow value is odd or exact, pick it.
+        // Otherwise, narrow is even and corresponds to either the rounded-up
+        // or rounded-down value. If narrow is the rounded-down value, we want
+        // the rounded-up value as it will be odd.
+        SDValue Adjust =
+            DAG.getSelect(dl, MVT::i32, NarrowIsRd, One, NegativeOne);
+        Adjust = DAG.getSelect(dl, MVT::i32, KeepNarrow, Zero, Adjust);
+        int ShiftAmount = ValueAsInt.SignBit - 31;
+        SDValue ShiftCnst = DAG.getConstant(
+            ShiftAmount, dl,
+            TLI.getShiftAmountTy(WideIntVT, DAG.getDataLayout()));
+        SignBit = DAG.getNode(ISD::SRL, dl, WideIntVT, SignBit, ShiftCnst);
+        SignBit = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, SignBit);
+        Op = DAG.getNode(ISD::OR, dl, MVT::i32, Adjust, SignBit);
+      } else {
+        Op = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+      }
+
+      SDValue One = DAG.getConstant(1, dl, MVT::i32);
+      SDValue Lsb = DAG.getNode(
+          ISD::SRL, dl, MVT::i32, Op,
+          DAG.getConstant(16, dl,
+                          TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+      Lsb = DAG.getNode(ISD::AND, dl, MVT::i32, Lsb, One);
+      SDValue RoundingBias = DAG.getNode(
+          ISD::ADD, dl, MVT::i32, DAG.getConstant(0x7fff, dl, MVT::i32), Lsb);
+      SDValue Add = DAG.getNode(ISD::ADD, dl, MVT::i32, Op, RoundingBias);
+      Op = DAG.getNode(
+          ISD::SRL, dl, MVT::i32, Add,
+          DAG.getConstant(16, dl,
+                          TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+      Op = DAG.getSelect(dl, MVT::i32, IsNaN,
+                         DAG.getConstant(0x00007fc0, dl, MVT::i32), Op);
+
+      Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Op);
+      Results.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::bf16, Op));
       break;
     }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7f58b312e7a201..e75799ca13b0bb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -776,6 +776,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
       AddPromotedToType(Op, MVT::bf16, MVT::f32);
   }
 
+  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
+    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+    setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
+  }
+
   // sm_80 only has conversions between f32 and bf16. Custom lower all other
   // bf16 conversions.
   if (STI.hasBF16Math() &&
@@ -2465,6 +2470,50 @@ SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
   return Op;
 }
 
+SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::bf16) {
+    if (Op.getOperand(0).getValueType() == MVT::f32 &&
+        (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70)) {
+      SDLoc Loc(Op);
+      return DAG.getNode(ISD::FP_TO_BF16, Loc, MVT::bf16, Op.getOperand(0));
+    }
+    if (Op.getOperand(0).getValueType() == MVT::f64 &&
+        (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
+      SDLoc Loc(Op);
+      return DAG.getNode(ISD::FP_TO_BF16, Loc, MVT::bf16, Op.getOperand(0));
+    }
+  }
+
+  // Everything else is considered legal.
+  return Op;
+}
+
+SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType() == MVT::bf16) {
+    if (Op.getValueType() == MVT::f32 &&
+        (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
+      SDLoc Loc(Op);
+      return DAG.getNode(ISD::BF16_TO_FP, Loc, Op.getValueType(),
+                         Op.getOperand(0));
+    }
+    if (Op.getValueType() == MVT::f64 &&
+        (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
+      SDLoc Loc(Op);
+      if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
+        Op = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0));
+        return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, Op);
+      }
+      return DAG.getNode(ISD::BF16_TO_FP, Loc, Op.getValueType(),
+                         Op.getOperand(0));
+    }
+  }
+
+  // Everything else is considered legal.
+  return Op;
+}
+
 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   if (Op.getValueType() != MVT::v2i16)
@@ -2527,6 +2576,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
     return LowerFP_TO_INT(Op, DAG);
+  case ISD::FP_ROUND:
+    return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND:
+    return LowerFP_EXTEND(Op, DAG);
   case ISD::VAARG:
     return LowerVAARG(Op, DAG);
   case ISD::VASTART:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5d3fd992812ef9..cf1d4580766918 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -618,6 +618,9 @@ class NVPTXTargetLowering : public TargetLowering {
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 631136ad621464..40d82ebecbed35 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -662,7 +662,7 @@ let hasSideEffects = false in {
                    // bf16->f32 was introduced early.
                    [hasPTX<71>, hasSM<80>],
                    // bf16->everything else needs sm90/ptx78
-                   [hasPTX<78>, hasSM<90>])>; 
+                   [hasPTX<78>, hasSM<90>])>;
     def _f32 :
       NVPTXInst<(outs RC:$dst),
                 (ins Float32Regs:$src, CvtMode:$mode),
@@ -3647,7 +3647,7 @@ def : Pat<(f16 (fpround Float32Regs:$a)),
 
 // fpround f32 -> bf16
 def : Pat<(bf16 (fpround Float32Regs:$a)),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
+          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
 
 // fpround f64 -> f16
 def : Pat<(f16 (fpround Float64Regs:$a)),
@@ -3655,7 +3655,7 @@ def : Pat<(f16 (fpround Float64Regs:$a)),
 
 // fpround f64 -> bf16
 def : Pat<(bf16 (fpround Float64Regs:$a)),
-          (CVT_bf16_f64 Float64Regs:$a, CvtRN)>;
+          (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 // fpround f64 -> f32
 def : Pat<(f32 (fpround Float64Regs:$a)),
           (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
@@ -3671,7 +3671,7 @@ def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
 def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
           (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
-          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>;
+          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
 
 // fpextend f16 -> f64
 def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
@@ -3679,7 +3679,7 @@ def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
 
 // fpextend bf16 -> f64
 def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
-          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>;
+          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
 
 // fpextend f32 -> f64
 def : Pat<(f64 (fpextend Float32Regs:$a)),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 387c4a16a008ae..39cb0a768701c0 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1918,8 +1918,14 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fc0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_short v[2:3], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1928,8 +1934,15 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1938,7 +1951,12 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_load_store_f32_to_bf16:
@@ -1946,7 +1964,14 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load float, ptr addrspace(1) %in
   %val.bf16 = fptrunc float %val to bfloat
@@ -1989,9 +2014,25 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7fc0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX8-NEXT:    v_and_b32_e32 v8, 0x80000000, v1
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v5, v4, v8
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
 ; GFX8-NEXT:    flat_store_short v[2:3], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2000,9 +2041,26 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_brev_b32 s8, 1
+; GFX9-NEXT:    s_movk_i32 s9, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; GFX9-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s8, v4
+; GFX9-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v5, s9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2011,8 +2069,22 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; GFX10-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
+; GFX10-NEXT:    v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s5
+; GFX10-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s4
+; GFX10-NEXT:    v_and_or_b32 v5, 0x80000000, v1, v4
+; GFX10-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v5, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_load_store_f64_to_bf16:
@@ -2020,8 +2092,27 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
+; GFX11-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
+; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v5, 0x80000000, v1, v4
+; GFX11-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v5, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load double, ptr addrspace(1) %in
   %val.bf16 = fptrunc double %val to bfloat
@@ -8487,7 +8578,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_bf16:
@@ -8496,7 +8593,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_bf16:
@@ -8505,7 +8608,11 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_bf16:
@@ -8515,7 +8622,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd bfloat %a, %b
   ret bfloat %op
@@ -8553,12 +8666,25 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v2bf16:
@@ -8566,11 +8692,23 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8583,7 +8721,17 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v2bf16:
@@ -8596,8 +8744,22 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
@@ -8644,15 +8806,34 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v3bf16:
@@ -8661,32 +8842,98 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -8739,20 +8986,46 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v4bf16:
@@ -8760,17 +9033,39 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -8781,17 +9076,37 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_add_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v4bf16:
@@ -8803,15 +9118,40 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_add_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -8897,36 +9237,86 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
 ; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v8, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v8bf16:
@@ -8934,29 +9324,71 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v7, v9, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_add_f32_e32 v6, v9, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v6, v9, v6
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v5, v9, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
@@ -8975,58 +9407,151 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_add_f32_e32 v9, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_add_f32_e32 v10, v11, v10
-; GFX10-NEXT:    v_add_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_add_f32_e32 v10, v10, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_add_f32_e32 v11, v12, v11
+; GFX10-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v3, v3, v7
+; GFX11-NEXT:    v_add_f32_e32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_dual_add_f32 v10, v10, v12 :: v_dual_add_f32 v1, v1, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v11, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v11, v12, v11
+; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_dual_add_f32 v10, v11, v10 :: v_dual_add_f32 v11, v13, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -9180,122 +9705,302 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
 ; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v15, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_add_f32_e32 v15, v18, v15
+; GFX8-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v15
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v17, v14, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v14, v18, v14
+; GFX8-NEXT:    v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v14
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_add_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v13, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v13, v18, v13
+; GFX8-NEXT:    v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v13
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_add_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v17, v12, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v12, v18, v12
+; GFX8-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v12
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v11, v18, v11
+; GFX8-NEXT:    v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v11
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_add_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v17, v10, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v10, v18, v10
+; GFX8-NEXT:    v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v10
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_add_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v9, v18, v9
+; GFX8-NEXT:    v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v9
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v16, v7
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: v_fadd_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v18, v15, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_add_f32_e32 v15, v17, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v14, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v14, v17, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_add_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v18, v13, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v13, v17, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v18, v12, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v12, v17, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v18, v11, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v11, v17, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v10, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_add_f32_e32 v10, v17, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v9, v17, v9
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
@@ -9313,119 +10018,294 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
 ; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT:    v_add_f32_e32 v15, v18, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT:    v_add_f32_e32 v15, v17, v15
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_add_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_add_f32_e32 v14, v20, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX10-NEXT:    v_add_f32_e32 v13, v21, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_add_f32_e32 v17, v20, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_add_f32_e32 v18, v20, v19
+; GFX10-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT:    v_add_f32_e32 v12, v19, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_add_f32_e32 v11, v19, v18
+; GFX10-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT:    v_add_f32_e32 v10, v21, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_add_f32_e32 v19, v20, v19
-; GFX10-NEXT:    v_add_f32_e32 v20, v22, v21
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT:    v_add_f32_e32 v17, v19, v17
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_add_f32_e32 v13, v21, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v14, v20, v18 :: v_dual_add_f32 v5, v5, v13
+; GFX11-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX11-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT:    v_add_f32_e32 v14, v19, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT:    v_add_f32_e32 v17, v20, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_add_f32 v4, v4, v12
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v12, v19, v18
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX11-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX11-NEXT:    v_dual_add_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v11, v19, v18
+; GFX11-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v19, v20, v19 :: v_dual_add_f32 v20, v22, v21
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v17, v19, v17 :: v_dual_add_f32 v2, v2, v10
+; GFX11-NEXT:    v_add_f32_e32 v10, v21, v20
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <16 x bfloat> %a, %b
   ret <16 x bfloat> %op
@@ -9833,247 +10713,716 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX8-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX8-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX8-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX8-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX8-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX8-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX8-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX8-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX8-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX8-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX8-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX8-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX8-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX8-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX8-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX8-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX8-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX8-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX8-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX8-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX8-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX8-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX8-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX8-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX8-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX8-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX8-NEXT:    v_writelane_b32 v35, s64, 32
+; GFX8-NEXT:    v_writelane_b32 v35, s65, 33
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_writelane_b32 v35, s66, 34
 ; GFX8-NEXT:    v_add_f32_e32 v31, v32, v31
 ; GFX8-NEXT:    v_add_f32_e32 v30, v14, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s68, 36
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    s_movk_i32 s68, 0x7fff
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
 ; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_add_f32_e32 v14, v32, v14
-; GFX8-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_add_f32_e32 v31, v31, v30
+; GFX8-NEXT:    v_add_f32_e32 v29, v13, v29
+; GFX8-NEXT:    v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT:    v_add_f32_e32 v31, v31, v29
+; GFX8-NEXT:    v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_add_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_add_f32_e32 v28, v12, v28
+; GFX8-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT:    v_add_f32_e32 v31, v15, v31
+; GFX8-NEXT:    v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT:    v_add_f32_e32 v28, v28, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v11
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_add_f32_e32 v28, v32, v28
-; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_add_f32_e32 v34, v34, v31
+; GFX8-NEXT:    v_add_f32_e32 v27, v11, v27
+; GFX8-NEXT:    v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_add_f32_e32 v27, v32, v27
-; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_add_f32_e32 v34, v34, v27
+; GFX8-NEXT:    v_add_f32_e32 v26, v10, v26
+; GFX8-NEXT:    v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_add_f32_e32 v26, v32, v26
-; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_add_f32_e32 v34, v34, v26
+; GFX8-NEXT:    v_add_f32_e32 v25, v9, v25
+; GFX8-NEXT:    v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT:    v_add_f32_e32 v25, v32, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT:    v_add_f32_e32 v32, v32, v33
-; GFX8-NEXT:    v_add_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_add_f32_e32 v34, v34, v25
+; GFX8-NEXT:    v_add_f32_e32 v24, v8, v24
+; GFX8-NEXT:    v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v7
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_add_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_add_f32_e32 v34, v34, v24
+; GFX8-NEXT:    v_add_f32_e32 v23, v7, v23
+; GFX8-NEXT:    v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_add_f32_e32 v34, v34, v23
 ; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v6, v34, v6
+; GFX8-NEXT:    v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_add_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v6
+; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v5, v21, v5
+; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_add_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v4, v20, v4
+; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v19, v3
+; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_add_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v2, v18, v2
+; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_add_f32_e32 v18, v33, v18
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v1, v17, v1
+; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_add_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], v0, v28
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], s68, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT:    v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT:    v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT:    v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT:    v_readlane_b32 s68, v35, 36
+; GFX8-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX8-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX8-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX8-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX8-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX8-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX8-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX8-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX8-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX8-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX8-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX8-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX8-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX8-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX8-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX8-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX8-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX8-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX8-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX8-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX8-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX8-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX8-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX8-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX8-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX8-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX8-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX8-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX8-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX8-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX8-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX8-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX8-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX8-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX9-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX9-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX9-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX9-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX9-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX9-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX9-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX9-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX9-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX9-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX9-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX9-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX9-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX9-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX9-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX9-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX9-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX9-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX9-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX9-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX9-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX9-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX9-NEXT:    v_writelane_b32 v35, s64, 32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_add_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s65, 33
+; GFX9-NEXT:    v_add_f32_e32 v32, v32, v31
+; GFX9-NEXT:    v_add_f32_e32 v30, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s66, 34
+; GFX9-NEXT:    s_movk_i32 s66, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT:    v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_add_f32_e32 v30, v32, v30
-; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_add_f32_e32 v32, v32, v30
+; GFX9-NEXT:    v_add_f32_e32 v29, v13, v29
+; GFX9-NEXT:    v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT:    v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT:    v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_add_f32_e32 v29, v32, v29
-; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_add_f32_e32 v32, v32, v29
+; GFX9-NEXT:    v_add_f32_e32 v28, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT:    v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT:    v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_add_f32_e32 v32, v32, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_add_f32_e32 v28, v32, v28
-; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_add_f32_e32 v15, v15, v33
+; GFX9-NEXT:    v_add_f32_e32 v27, v11, v27
+; GFX9-NEXT:    v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT:    v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT:    v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_add_f32_e32 v27, v32, v27
-; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v27
+; GFX9-NEXT:    v_add_f32_e32 v26, v10, v26
+; GFX9-NEXT:    v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT:    v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT:    v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_add_f32_e32 v26, v32, v26
-; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v26
+; GFX9-NEXT:    v_add_f32_e32 v25, v9, v25
+; GFX9-NEXT:    v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT:    v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT:    v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT:    v_add_f32_e32 v25, v32, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT:    v_add_f32_e32 v32, v32, v33
-; GFX9-NEXT:    v_add_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v25
+; GFX9-NEXT:    v_add_f32_e32 v24, v8, v24
+; GFX9-NEXT:    v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT:    v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT:    v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_add_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v24
+; GFX9-NEXT:    v_add_f32_e32 v23, v7, v23
+; GFX9-NEXT:    v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT:    v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT:    v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_add_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v23
+; GFX9-NEXT:    v_add_f32_e32 v22, v6, v22
+; GFX9-NEXT:    v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT:    v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT:    v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[40:41], v22, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_add_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v22
+; GFX9-NEXT:    v_add_f32_e32 v21, v5, v21
+; GFX9-NEXT:    v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT:    v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT:    v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[44:45], v21, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v21
+; GFX9-NEXT:    v_add_f32_e32 v20, v4, v20
+; GFX9-NEXT:    v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT:    v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT:    v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[48:49], v20, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v20
+; GFX9-NEXT:    v_add_f32_e32 v19, v3, v19
+; GFX9-NEXT:    v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT:    v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT:    v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[52:53], v19, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v19
+; GFX9-NEXT:    v_add_f32_e32 v18, v2, v18
+; GFX9-NEXT:    v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT:    v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT:    v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[56:57], v18, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v18
+; GFX9-NEXT:    v_add_f32_e32 v17, v1, v17
+; GFX9-NEXT:    v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT:    v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT:    v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[60:61], v17, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_add_f32_e32 v16, v0, v16
+; GFX9-NEXT:    v_add_f32_e32 v32, v32, v34
+; GFX9-NEXT:    v_add_f32_e32 v33, v33, v17
+; GFX9-NEXT:    v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT:    v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
@@ -10082,13 +11431,72 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
 ; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
 ; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX9-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX9-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX9-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX9-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX9-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX9-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX9-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX9-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX9-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX9-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX9-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX9-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX9-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX9-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX9-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX9-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX9-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX9-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX9-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX9-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX9-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX9-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX9-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX9-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX9-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX9-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX9-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
@@ -10101,14 +11509,18 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_add_f32_e32 v39, v48, v39
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
 ; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
@@ -10121,24 +11533,6 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
@@ -10149,20 +11543,88 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_add_f32_e32 v19, v28, v38
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_add_f32_e32 v18, v27, v48
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_add_f32_e32 v17, v26, v50
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_add_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_add_f32_e32 v20, v29, v36
+; GFX10-NEXT:    v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT:    v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT:    v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT:    v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT:    v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT:    v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT:    v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT:    v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT:    v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT:    v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT:    v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT:    v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
 ; GFX10-NEXT:    v_add_f32_e32 v51, v52, v51
 ; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
 ; GFX10-NEXT:    v_add_f32_e32 v25, v54, v53
@@ -10172,142 +11634,423 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_add_f32_e32 v23, v66, v65
 ; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
 ; GFX10-NEXT:    v_add_f32_e32 v22, v68, v67
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_add_f32_e32 v21, v30, v34
-; GFX10-NEXT:    v_add_f32_e32 v29, v29, v36
-; GFX10-NEXT:    v_add_f32_e32 v28, v28, v38
-; GFX10-NEXT:    v_add_f32_e32 v27, v27, v48
-; GFX10-NEXT:    v_add_f32_e32 v26, v26, v50
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT:    v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT:    v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT:    v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT:    v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT:    v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT:    v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT:    v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT:    v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT:    v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT:    v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT:    v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT:    v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT:    v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT:    v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT:    v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT:    v_add_f32_e32 v17, v31, v33
+; GFX10-NEXT:    v_add_f32_e32 v15, v15, v32
+; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT:    v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT:    v_add_f32_e32 v16, v32, v16
-; GFX10-NEXT:    v_add_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
 ; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_add_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_add_f32 v4, v4, v20
+; GFX11-NEXT:    v_add_f32_e32 v21, v70, v69
+; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_add_f32 v1, v1, v17
+; GFX11-NEXT:    v_add_f32_e32 v17, v86, v85
+; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_add_f32_e32 v22, v68, v67
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_add_f32_e32 v27, v50, v49
+; GFX11-NEXT:    v_bfe_u32 v50, v10, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v18 :: v_dual_add_f32 v3, v3, v19
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX11-NEXT:    v_add_f32_e32 v26, v52, v51
-; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_add_f32_e32 v20, v80, v71
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_add_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_dual_add_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_dual_add_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT:    v_add_f32_e32 v9, v9, v25
 ; GFX11-NEXT:    v_add_f32_e32 v25, v54, v53
-; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_add_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX11-NEXT:    v_add_f32_e32 v23, v66, v65
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_add_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_add_f32_e32 v29, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_add_f32_e32 v37, v86, v85
-; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_add_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
-; GFX11-NEXT:    v_dual_add_f32 v34, v80, v71 :: v_dual_add_f32 v35, v82, v81
-; GFX11-NEXT:    v_add_f32_e32 v36, v84, v83
-; GFX11-NEXT:    v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v15, v15, v17
-; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT:    v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT:    v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT:    v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT:    v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT:    v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT:    v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT:    v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT:    v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT:    v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT:    v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT:    v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT:    v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT:    v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT:    v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT:    v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT:    v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT:    v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT:    v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT:    v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT:    v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT:    v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT:    v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT:    v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT:    v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT:    v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT:    v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT:    v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT:    v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT:    v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT:    v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT:    v_add_f32_e32 v31, v31, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v15, v15, v32
+; GFX11-NEXT:    v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT:    v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <32 x bfloat> %a, %b
   ret <32 x bfloat> %op
@@ -10335,7 +12078,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_bf16_fpimm_0:
@@ -10343,7 +12092,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_bf16_fpimm_0:
@@ -10351,7 +12106,11 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_bf16_fpimm_0:
@@ -10360,7 +12119,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd bfloat %arg0, 1.0
   ret bfloat %add
@@ -10388,7 +12153,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_bf16_fpimm_1:
@@ -10396,7 +12167,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_bf16_fpimm_1:
@@ -10404,7 +12181,11 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_bf16_fpimm_1:
@@ -10413,7 +12194,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd bfloat %arg0, 42.0
   ret bfloat %add
@@ -10444,7 +12231,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_bf16:
@@ -10453,7 +12246,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_bf16:
@@ -10462,7 +12261,11 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_bf16:
@@ -10472,7 +12275,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub bfloat %a, %b
   ret bfloat %op
@@ -10510,12 +12319,25 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v2bf16:
@@ -10523,11 +12345,23 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10540,7 +12374,17 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_v2bf16:
@@ -10553,8 +12397,22 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
@@ -10601,15 +12459,34 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v3bf16:
@@ -10618,40 +12495,106 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub <3 x bfloat> %a, %b
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+;
+; GFX11-LABEL: v_fsub_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
@@ -10696,20 +12639,46 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v4bf16:
@@ -10717,17 +12686,39 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -10738,17 +12729,37 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_sub_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_v4bf16:
@@ -10760,15 +12771,40 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_sub_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -10799,7 +12835,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_bf16:
@@ -10808,7 +12850,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_bf16:
@@ -10817,7 +12865,11 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_bf16:
@@ -10827,7 +12879,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul bfloat %a, %b
   ret bfloat %op
@@ -10865,12 +12923,25 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v2bf16:
@@ -10878,11 +12949,23 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10895,7 +12978,17 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v2bf16:
@@ -10908,8 +13001,22 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
@@ -10956,15 +13063,34 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v3bf16:
@@ -10973,32 +13099,98 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -11051,20 +13243,46 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v4bf16:
@@ -11072,17 +13290,39 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -11093,17 +13333,37 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_mul_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_mul_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v4bf16:
@@ -11115,15 +13375,40 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_mul_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_mul_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -11209,36 +13494,86 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v8, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v8bf16:
@@ -11246,29 +13581,71 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v6, v9, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v6, v9, v6
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v5, v9, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v5, v9, v5
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
@@ -11287,58 +13664,151 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v9, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_mul_f32_e32 v10, v11, v10
-; GFX10-NEXT:    v_mul_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_mul_f32_e32 v10, v10, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_mul_f32_e32 v11, v12, v11
+; GFX10-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v11, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_dual_mul_f32 v10, v11, v10 :: v_dual_mul_f32 v11, v13, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v3, v3, v7
+; GFX11-NEXT:    v_mul_f32_e32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v12 :: v_dual_mul_f32 v1, v1, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v11, v12, v11
+; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -11492,68 +13962,166 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
 ; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v15, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_mul_f32_e32 v15, v18, v15
+; GFX8-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v15
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v17, v14, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v14, v18, v14
+; GFX8-NEXT:    v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v14
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_mul_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v13, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v13, v18, v13
+; GFX8-NEXT:    v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v13
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_mul_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v17, v12, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v12, v18, v12
+; GFX8-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v12
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_mul_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v11, v18, v11
+; GFX8-NEXT:    v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v11
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_mul_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v17, v10, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v10, v18, v10
+; GFX8-NEXT:    v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v10
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_mul_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v9, v18, v9
+; GFX8-NEXT:    v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v9
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v16, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v16bf16:
@@ -11561,53 +14129,135 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v18, v15, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_mul_f32_e32 v15, v17, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v14, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_mul_f32_e32 v14, v17, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_mul_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v18, v13, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_mul_f32_e32 v13, v17, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_mul_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v18, v12, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_mul_f32_e32 v12, v17, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_mul_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v18, v11, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_mul_f32_e32 v11, v17, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v10, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v10, v17, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v9, v17, v9
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
@@ -11625,119 +14275,294 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
 ; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT:    v_mul_f32_e32 v15, v18, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT:    v_mul_f32_e32 v15, v17, v15
-; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_mul_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_mul_f32_e32 v14, v20, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GFX10-NEXT:    v_mul_f32_e32 v13, v21, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_mul_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v17, v20, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_mul_f32_e32 v18, v20, v19
+; GFX10-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT:    v_mul_f32_e32 v12, v19, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_mul_f32_e32 v11, v19, v18
+; GFX10-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT:    v_mul_f32_e32 v10, v21, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_mul_f32_e32 v19, v20, v19
-; GFX10-NEXT:    v_mul_f32_e32 v20, v22, v21
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT:    v_mul_f32_e32 v17, v19, v17
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_mul_f32_e32 v13, v21, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v15
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v14, v20, v18 :: v_dual_mul_f32 v5, v5, v13
+; GFX11-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX11-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT:    v_mul_f32_e32 v14, v19, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT:    v_mul_f32_e32 v17, v20, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_mul_f32 v4, v4, v12
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mul_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v12, v19, v18
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GFX11-NEXT:    v_dual_mul_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v11, v19, v18
+; GFX11-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v19, v20, v19 :: v_dual_mul_f32 v20, v22, v21
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v17, v19, v17 :: v_dual_mul_f32 v2, v2, v10
+; GFX11-NEXT:    v_mul_f32_e32 v10, v21, v20
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <16 x bfloat> %a, %b
   ret <16 x bfloat> %op
@@ -12145,247 +14970,716 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX8-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX8-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX8-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX8-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX8-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX8-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX8-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX8-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX8-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX8-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX8-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX8-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX8-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX8-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX8-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX8-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX8-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX8-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX8-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX8-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX8-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX8-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX8-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX8-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX8-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX8-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX8-NEXT:    v_writelane_b32 v35, s64, 32
+; GFX8-NEXT:    v_writelane_b32 v35, s65, 33
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_writelane_b32 v35, s66, 34
 ; GFX8-NEXT:    v_mul_f32_e32 v31, v32, v31
 ; GFX8-NEXT:    v_mul_f32_e32 v30, v14, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s68, 36
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    s_movk_i32 s68, 0x7fff
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
 ; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_mul_f32_e32 v14, v32, v14
-; GFX8-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_mul_f32_e32 v31, v31, v30
+; GFX8-NEXT:    v_mul_f32_e32 v29, v13, v29
+; GFX8-NEXT:    v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT:    v_mul_f32_e32 v31, v31, v29
+; GFX8-NEXT:    v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_mul_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_mul_f32_e32 v28, v12, v28
+; GFX8-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT:    v_mul_f32_e32 v31, v15, v31
+; GFX8-NEXT:    v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT:    v_mul_f32_e32 v28, v28, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v11
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_mul_f32_e32 v28, v32, v28
-; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_mul_f32_e32 v34, v34, v31
+; GFX8-NEXT:    v_mul_f32_e32 v27, v11, v27
+; GFX8-NEXT:    v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_mul_f32_e32 v27, v32, v27
-; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_mul_f32_e32 v34, v34, v27
+; GFX8-NEXT:    v_mul_f32_e32 v26, v10, v26
+; GFX8-NEXT:    v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_mul_f32_e32 v26, v32, v26
-; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_mul_f32_e32 v34, v34, v26
+; GFX8-NEXT:    v_mul_f32_e32 v25, v9, v25
+; GFX8-NEXT:    v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT:    v_mul_f32_e32 v25, v32, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT:    v_mul_f32_e32 v32, v32, v33
-; GFX8-NEXT:    v_mul_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_mul_f32_e32 v34, v34, v25
+; GFX8-NEXT:    v_mul_f32_e32 v24, v8, v24
+; GFX8-NEXT:    v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v7
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_mul_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_mul_f32_e32 v34, v34, v24
+; GFX8-NEXT:    v_mul_f32_e32 v23, v7, v23
+; GFX8-NEXT:    v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_mul_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_mul_f32_e32 v34, v34, v23
 ; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v6, v34, v6
+; GFX8-NEXT:    v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_mul_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v5, v21, v5
+; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_mul_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, v20, v4
+; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_mul_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, v19, v3
+; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_mul_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v2, v18, v2
+; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_mul_f32_e32 v18, v33, v18
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v17, v1
+; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_mul_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], v0, v28
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], s68, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT:    v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT:    v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT:    v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT:    v_readlane_b32 s68, v35, 36
+; GFX8-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX8-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX8-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX8-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX8-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX8-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX8-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX8-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX8-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX8-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX8-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX8-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX8-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX8-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX8-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX8-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX8-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX8-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX8-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX8-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX8-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX8-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX8-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX8-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX8-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX8-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX8-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX8-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX8-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX8-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX8-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX8-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX8-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX8-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX9-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX9-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX9-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX9-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX9-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX9-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX9-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX9-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX9-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX9-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX9-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX9-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX9-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX9-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX9-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX9-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX9-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX9-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX9-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX9-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX9-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX9-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX9-NEXT:    v_writelane_b32 v35, s64, 32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_mul_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s65, 33
+; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v31
+; GFX9-NEXT:    v_mul_f32_e32 v30, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s66, 34
+; GFX9-NEXT:    s_movk_i32 s66, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT:    v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_mul_f32_e32 v30, v32, v30
-; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v30
+; GFX9-NEXT:    v_mul_f32_e32 v29, v13, v29
+; GFX9-NEXT:    v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT:    v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT:    v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_mul_f32_e32 v29, v32, v29
-; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v29
+; GFX9-NEXT:    v_mul_f32_e32 v28, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT:    v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT:    v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_mul_f32_e32 v28, v32, v28
-; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_mul_f32_e32 v15, v15, v33
+; GFX9-NEXT:    v_mul_f32_e32 v27, v11, v27
+; GFX9-NEXT:    v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT:    v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT:    v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_mul_f32_e32 v27, v32, v27
-; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v27
+; GFX9-NEXT:    v_mul_f32_e32 v26, v10, v26
+; GFX9-NEXT:    v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT:    v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT:    v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_mul_f32_e32 v26, v32, v26
-; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v26
+; GFX9-NEXT:    v_mul_f32_e32 v25, v9, v25
+; GFX9-NEXT:    v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT:    v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT:    v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT:    v_mul_f32_e32 v25, v32, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v33
-; GFX9-NEXT:    v_mul_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v25
+; GFX9-NEXT:    v_mul_f32_e32 v24, v8, v24
+; GFX9-NEXT:    v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT:    v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT:    v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_mul_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v24
+; GFX9-NEXT:    v_mul_f32_e32 v23, v7, v23
+; GFX9-NEXT:    v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT:    v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT:    v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_mul_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v23
+; GFX9-NEXT:    v_mul_f32_e32 v22, v6, v22
+; GFX9-NEXT:    v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT:    v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT:    v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[40:41], v22, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_mul_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v22
+; GFX9-NEXT:    v_mul_f32_e32 v21, v5, v21
+; GFX9-NEXT:    v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT:    v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT:    v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[44:45], v21, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_mul_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v21
+; GFX9-NEXT:    v_mul_f32_e32 v20, v4, v20
+; GFX9-NEXT:    v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT:    v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT:    v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[48:49], v20, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_mul_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v20
+; GFX9-NEXT:    v_mul_f32_e32 v19, v3, v19
+; GFX9-NEXT:    v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT:    v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT:    v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[52:53], v19, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v19
+; GFX9-NEXT:    v_mul_f32_e32 v18, v2, v18
+; GFX9-NEXT:    v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT:    v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT:    v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[56:57], v18, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v18
+; GFX9-NEXT:    v_mul_f32_e32 v17, v1, v17
+; GFX9-NEXT:    v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT:    v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT:    v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[60:61], v17, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_mul_f32_e32 v16, v0, v16
+; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v34
+; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v17
+; GFX9-NEXT:    v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT:    v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
@@ -12394,25 +15688,64 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
 ; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
 ; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX9-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX9-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX9-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX9-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX9-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX9-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX9-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX9-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX9-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX9-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX9-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX9-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX9-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX9-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX9-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX9-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX9-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX9-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX9-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX9-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX9-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX9-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX9-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX9-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX9-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX9-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX9-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
 ; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
@@ -12421,6 +15754,30 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
 ; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v39, v48, v39
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
 ; GFX10-NEXT:    v_mul_f32_e32 v11, v11, v27
@@ -12433,7 +15790,73 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_mul_f32_e32 v19, v28, v38
+; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_mul_f32_e32 v18, v27, v48
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_mul_f32_e32 v17, v26, v50
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_mul_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_mul_f32_e32 v20, v29, v36
+; GFX10-NEXT:    v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT:    v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT:    v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT:    v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT:    v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT:    v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT:    v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT:    v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT:    v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT:    v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT:    v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
@@ -12451,30 +15874,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
 ; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_mul_f32_e32 v33, v34, v33
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_mul_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_mul_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT:    v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
 ; GFX10-NEXT:    v_mul_f32_e32 v51, v52, v51
 ; GFX10-NEXT:    v_mul_f32_e32 v9, v9, v25
 ; GFX10-NEXT:    v_mul_f32_e32 v25, v54, v53
@@ -12484,142 +15891,423 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_mul_f32_e32 v23, v66, v65
 ; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v22
 ; GFX10-NEXT:    v_mul_f32_e32 v22, v68, v67
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_mul_f32_e32 v21, v30, v34
-; GFX10-NEXT:    v_mul_f32_e32 v29, v29, v36
-; GFX10-NEXT:    v_mul_f32_e32 v28, v28, v38
-; GFX10-NEXT:    v_mul_f32_e32 v27, v27, v48
-; GFX10-NEXT:    v_mul_f32_e32 v26, v26, v50
-; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT:    v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT:    v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT:    v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT:    v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT:    v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT:    v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT:    v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT:    v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT:    v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT:    v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT:    v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT:    v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT:    v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT:    v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT:    v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT:    v_mul_f32_e32 v17, v31, v33
+; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v32
+; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT:    v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT:    v_mul_f32_e32 v16, v32, v16
-; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
 ; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_mul_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_mul_f32 v4, v4, v20
+; GFX11-NEXT:    v_mul_f32_e32 v21, v70, v69
+; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_mul_f32 v1, v1, v17
+; GFX11-NEXT:    v_mul_f32_e32 v17, v86, v85
+; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_mul_f32_e32 v22, v68, v67
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_mul_f32_e32 v27, v50, v49
+; GFX11-NEXT:    v_bfe_u32 v50, v10, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v18 :: v_dual_mul_f32 v3, v3, v19
-; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_f32_e32 v10, v10, v26
-; GFX11-NEXT:    v_mul_f32_e32 v26, v52, v51
-; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_mul_f32_e32 v20, v80, v71
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_mul_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_dual_mul_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_dual_mul_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT:    v_mul_f32_e32 v9, v9, v25
 ; GFX11-NEXT:    v_mul_f32_e32 v25, v54, v53
-; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_mul_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX11-NEXT:    v_mul_f32_e32 v23, v66, v65
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_mul_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_mul_f32_e32 v29, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_mul_f32_e32 v37, v86, v85
-; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_mul_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
-; GFX11-NEXT:    v_dual_mul_f32 v34, v80, v71 :: v_dual_mul_f32 v35, v82, v81
-; GFX11-NEXT:    v_mul_f32_e32 v36, v84, v83
-; GFX11-NEXT:    v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v15, v15, v17
-; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT:    v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT:    v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT:    v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT:    v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT:    v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT:    v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT:    v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT:    v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT:    v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT:    v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT:    v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT:    v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT:    v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT:    v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT:    v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT:    v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT:    v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT:    v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT:    v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT:    v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT:    v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT:    v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT:    v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT:    v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT:    v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT:    v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT:    v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT:    v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT:    v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT:    v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT:    v_mul_f32_e32 v31, v31, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v15, v15, v32
+; GFX11-NEXT:    v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT:    v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <32 x bfloat> %a, %b
   ret <32 x bfloat> %op
@@ -12679,8 +16367,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_fma_f32 v5, v6, v4, v5
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc0
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_bf16:
@@ -12690,6 +16384,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX9-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
 ; GFX9-NEXT:    v_fma_f32 v4, v5, v4, v4
@@ -12698,8 +16393,13 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_fma_f32 v5, v6, v4, v5
 ; GFX9-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc0
 ; GFX9-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_bf16:
@@ -12718,7 +16418,11 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_bf16:
@@ -12743,7 +16447,13 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fdiv bfloat %a, %b
   ret bfloat %op
@@ -13074,7 +16784,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_bf16:
@@ -13083,7 +16799,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_bf16:
@@ -13092,7 +16814,11 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_bf16:
@@ -13102,7 +16828,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
   ret bfloat %op
@@ -13148,12 +16880,25 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v2bf16:
@@ -13161,11 +16906,23 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -13178,7 +16935,17 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_min_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v2bf16:
@@ -13191,8 +16958,22 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %op
@@ -13251,49 +17032,134 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v3bf16:
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
@@ -13362,20 +17228,46 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v4bf16:
@@ -13383,17 +17275,39 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -13404,17 +17318,37 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_min_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v4bf16:
@@ -13426,15 +17360,40 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_min_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -13552,36 +17511,86 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_min_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_min_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v8, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v8bf16:
@@ -13589,29 +17598,71 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_min_f32_e32 v7, v9, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_min_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_min_f32_e32 v6, v9, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v6, v9, v6
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_min_f32_e32 v5, v9, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v5, v9, v5
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
@@ -13630,58 +17681,151 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_min_f32_e32 v9, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_min_f32_e32 v10, v11, v10
-; GFX10-NEXT:    v_min_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_min_f32_e32 v10, v10, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_min_f32_e32 v11, v12, v11
+; GFX10-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v3, v3, v7
+; GFX11-NEXT:    v_min_f32_e32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_dual_min_f32 v10, v10, v12 :: v_dual_min_f32 v1, v1, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v11, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v11, v12, v11
+; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_dual_min_f32 v10, v11, v10 :: v_dual_min_f32 v11, v13, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -13899,68 +18043,166 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
 ; GFX8-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v15, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_min_f32_e32 v15, v18, v15
+; GFX8-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v15
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v17, v14, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_min_f32_e32 v14, v18, v14
+; GFX8-NEXT:    v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v14
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_min_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v13, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT:    v_min_f32_e32 v13, v18, v13
+; GFX8-NEXT:    v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v13
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_min_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v17, v12, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT:    v_min_f32_e32 v12, v18, v12
+; GFX8-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v12
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_min_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT:    v_min_f32_e32 v11, v18, v11
+; GFX8-NEXT:    v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v11
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_min_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v17, v10, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v10, v18, v10
+; GFX8-NEXT:    v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v10
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_min_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT:    v_min_f32_e32 v9, v18, v9
+; GFX8-NEXT:    v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v9
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_min_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v16, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v16bf16:
@@ -13968,53 +18210,135 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v18, v15, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_min_f32_e32 v15, v17, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v14, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_min_f32_e32 v14, v17, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_min_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v18, v13, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_min_f32_e32 v13, v17, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_min_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v18, v12, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_min_f32_e32 v12, v17, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_min_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v18, v11, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_min_f32_e32 v11, v17, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_min_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v10, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_min_f32_e32 v10, v17, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_min_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v9, v17, v9
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
@@ -14032,119 +18356,294 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
 ; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT:    v_min_f32_e32 v15, v18, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT:    v_min_f32_e32 v15, v17, v15
-; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_min_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_min_f32_e32 v14, v20, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX10-NEXT:    v_min_f32_e32 v5, v5, v13
-; GFX10-NEXT:    v_min_f32_e32 v13, v21, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_min_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_min_f32_e32 v17, v20, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_min_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_min_f32_e32 v18, v20, v19
+; GFX10-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT:    v_min_f32_e32 v12, v19, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_min_f32_e32 v11, v19, v18
+; GFX10-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_min_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT:    v_min_f32_e32 v10, v21, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_min_f32_e32 v19, v20, v19
-; GFX10-NEXT:    v_min_f32_e32 v20, v22, v21
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT:    v_min_f32_e32 v17, v19, v17
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_min_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_min_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_min_f32_e32 v13, v21, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
-; GFX11-NEXT:    v_min_f32_e32 v7, v7, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT:    v_dual_min_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_min_f32 v14, v20, v18 :: v_dual_min_f32 v5, v5, v13
+; GFX11-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX11-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT:    v_min_f32_e32 v14, v19, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT:    v_min_f32_e32 v17, v20, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_min_f32 v4, v4, v12
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_min_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v12, v19, v18
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX11-NEXT:    v_min_f32_e32 v3, v3, v11
-; GFX11-NEXT:    v_dual_min_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v11, v19, v18
+; GFX11-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v19, v20, v19 :: v_dual_min_f32 v20, v22, v21
-; GFX11-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v17, v19, v17 :: v_dual_min_f32 v2, v2, v10
+; GFX11-NEXT:    v_min_f32_e32 v10, v21, v20
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
   ret <16 x bfloat> %op
@@ -14680,247 +19179,716 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX8-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX8-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX8-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX8-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX8-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX8-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX8-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX8-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX8-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX8-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX8-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX8-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX8-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX8-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX8-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX8-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX8-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX8-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX8-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX8-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX8-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX8-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX8-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX8-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX8-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX8-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX8-NEXT:    v_writelane_b32 v35, s64, 32
+; GFX8-NEXT:    v_writelane_b32 v35, s65, 33
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_writelane_b32 v35, s66, 34
 ; GFX8-NEXT:    v_min_f32_e32 v31, v32, v31
 ; GFX8-NEXT:    v_min_f32_e32 v30, v14, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s68, 36
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    s_movk_i32 s68, 0x7fff
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
 ; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_min_f32_e32 v14, v32, v14
-; GFX8-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_min_f32_e32 v31, v31, v30
+; GFX8-NEXT:    v_min_f32_e32 v29, v13, v29
+; GFX8-NEXT:    v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT:    v_min_f32_e32 v31, v31, v29
+; GFX8-NEXT:    v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_min_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_min_f32_e32 v28, v12, v28
+; GFX8-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT:    v_min_f32_e32 v31, v15, v31
+; GFX8-NEXT:    v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT:    v_min_f32_e32 v28, v28, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v11
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_min_f32_e32 v28, v32, v28
-; GFX8-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_min_f32_e32 v34, v34, v31
+; GFX8-NEXT:    v_min_f32_e32 v27, v11, v27
+; GFX8-NEXT:    v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_min_f32_e32 v27, v32, v27
-; GFX8-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_min_f32_e32 v34, v34, v27
+; GFX8-NEXT:    v_min_f32_e32 v26, v10, v26
+; GFX8-NEXT:    v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_min_f32_e32 v26, v32, v26
-; GFX8-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_min_f32_e32 v34, v34, v26
+; GFX8-NEXT:    v_min_f32_e32 v25, v9, v25
+; GFX8-NEXT:    v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT:    v_min_f32_e32 v25, v32, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT:    v_min_f32_e32 v32, v32, v33
-; GFX8-NEXT:    v_min_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_min_f32_e32 v34, v34, v25
+; GFX8-NEXT:    v_min_f32_e32 v24, v8, v24
+; GFX8-NEXT:    v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v7
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_min_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_min_f32_e32 v34, v34, v24
+; GFX8-NEXT:    v_min_f32_e32 v23, v7, v23
+; GFX8-NEXT:    v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_min_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_min_f32_e32 v34, v34, v23
 ; GFX8-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT:    v_min_f32_e32 v6, v34, v6
+; GFX8-NEXT:    v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_min_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v6
+; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT:    v_min_f32_e32 v5, v21, v5
+; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_min_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_min_f32_e32 v4, v4, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT:    v_min_f32_e32 v4, v20, v4
+; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_min_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT:    v_min_f32_e32 v3, v19, v3
+; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_min_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v2, v18, v2
+; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_min_f32_e32 v18, v33, v18
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_min_f32_e32 v1, v17, v1
+; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_min_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], v0, v28
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], s68, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT:    v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT:    v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT:    v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT:    v_readlane_b32 s68, v35, 36
+; GFX8-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX8-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX8-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX8-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX8-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX8-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX8-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX8-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX8-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX8-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX8-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX8-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX8-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX8-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX8-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX8-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX8-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX8-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX8-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX8-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX8-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX8-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX8-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX8-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX8-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX8-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX8-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX8-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX8-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX8-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX8-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX8-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX8-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX8-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX9-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX9-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX9-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX9-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX9-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX9-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX9-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX9-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX9-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX9-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX9-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX9-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX9-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX9-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX9-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX9-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX9-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX9-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX9-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX9-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX9-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX9-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX9-NEXT:    v_writelane_b32 v35, s64, 32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s65, 33
+; GFX9-NEXT:    v_min_f32_e32 v32, v32, v31
+; GFX9-NEXT:    v_min_f32_e32 v30, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s66, 34
+; GFX9-NEXT:    s_movk_i32 s66, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT:    v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_min_f32_e32 v30, v32, v30
-; GFX9-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_min_f32_e32 v32, v32, v30
+; GFX9-NEXT:    v_min_f32_e32 v29, v13, v29
+; GFX9-NEXT:    v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT:    v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT:    v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_min_f32_e32 v29, v32, v29
-; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_min_f32_e32 v32, v32, v29
+; GFX9-NEXT:    v_min_f32_e32 v28, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT:    v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT:    v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_min_f32_e32 v32, v32, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_min_f32_e32 v28, v32, v28
-; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_min_f32_e32 v15, v15, v33
+; GFX9-NEXT:    v_min_f32_e32 v27, v11, v27
+; GFX9-NEXT:    v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT:    v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT:    v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_min_f32_e32 v27, v32, v27
-; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v27
+; GFX9-NEXT:    v_min_f32_e32 v26, v10, v26
+; GFX9-NEXT:    v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT:    v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT:    v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_min_f32_e32 v26, v32, v26
-; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v26
+; GFX9-NEXT:    v_min_f32_e32 v25, v9, v25
+; GFX9-NEXT:    v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT:    v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT:    v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT:    v_min_f32_e32 v25, v32, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT:    v_min_f32_e32 v32, v32, v33
-; GFX9-NEXT:    v_min_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v25
+; GFX9-NEXT:    v_min_f32_e32 v24, v8, v24
+; GFX9-NEXT:    v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT:    v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT:    v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_min_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v24
+; GFX9-NEXT:    v_min_f32_e32 v23, v7, v23
+; GFX9-NEXT:    v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT:    v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT:    v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_min_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v23
+; GFX9-NEXT:    v_min_f32_e32 v22, v6, v22
+; GFX9-NEXT:    v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT:    v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT:    v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[40:41], v22, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_min_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v22
+; GFX9-NEXT:    v_min_f32_e32 v21, v5, v21
+; GFX9-NEXT:    v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT:    v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT:    v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[44:45], v21, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_min_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v21
+; GFX9-NEXT:    v_min_f32_e32 v20, v4, v20
+; GFX9-NEXT:    v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT:    v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT:    v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[48:49], v20, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_min_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v20
+; GFX9-NEXT:    v_min_f32_e32 v19, v3, v19
+; GFX9-NEXT:    v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT:    v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT:    v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[52:53], v19, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_min_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v19
+; GFX9-NEXT:    v_min_f32_e32 v18, v2, v18
+; GFX9-NEXT:    v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT:    v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT:    v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[56:57], v18, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v18
+; GFX9-NEXT:    v_min_f32_e32 v17, v1, v17
+; GFX9-NEXT:    v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT:    v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT:    v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[60:61], v17, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_min_f32_e32 v16, v0, v16
+; GFX9-NEXT:    v_min_f32_e32 v32, v32, v34
+; GFX9-NEXT:    v_min_f32_e32 v33, v33, v17
+; GFX9-NEXT:    v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT:    v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
@@ -14929,13 +19897,72 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
 ; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
 ; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX9-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX9-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX9-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX9-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX9-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX9-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX9-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX9-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX9-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX9-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX9-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX9-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX9-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX9-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX9-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX9-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX9-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX9-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX9-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX9-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX9-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX9-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX9-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX9-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX9-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX9-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX9-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
@@ -14948,14 +19975,18 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_min_f32_e32 v39, v48, v39
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
 ; GFX10-NEXT:    v_min_f32_e32 v11, v11, v27
@@ -14968,7 +19999,73 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_min_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_min_f32_e32 v19, v28, v38
+; GFX10-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_min_f32_e32 v18, v27, v48
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_min_f32_e32 v17, v26, v50
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_min_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_min_f32_e32 v20, v29, v36
+; GFX10-NEXT:    v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT:    v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT:    v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT:    v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT:    v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT:    v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT:    v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT:    v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT:    v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT:    v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT:    v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
@@ -14986,30 +20083,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
 ; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_min_f32_e32 v33, v34, v33
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_min_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_min_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT:    v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
 ; GFX10-NEXT:    v_min_f32_e32 v51, v52, v51
 ; GFX10-NEXT:    v_min_f32_e32 v9, v9, v25
 ; GFX10-NEXT:    v_min_f32_e32 v25, v54, v53
@@ -15019,142 +20100,423 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_min_f32_e32 v23, v66, v65
 ; GFX10-NEXT:    v_min_f32_e32 v6, v6, v22
 ; GFX10-NEXT:    v_min_f32_e32 v22, v68, v67
-; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_min_f32_e32 v21, v30, v34
-; GFX10-NEXT:    v_min_f32_e32 v29, v29, v36
-; GFX10-NEXT:    v_min_f32_e32 v28, v28, v38
-; GFX10-NEXT:    v_min_f32_e32 v27, v27, v48
-; GFX10-NEXT:    v_min_f32_e32 v26, v26, v50
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT:    v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT:    v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT:    v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT:    v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT:    v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT:    v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT:    v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT:    v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT:    v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT:    v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT:    v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT:    v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT:    v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT:    v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT:    v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT:    v_min_f32_e32 v17, v31, v33
+; GFX10-NEXT:    v_min_f32_e32 v15, v15, v32
+; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT:    v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT:    v_min_f32_e32 v16, v32, v16
-; GFX10-NEXT:    v_min_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
 ; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
 ; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_min_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v4, v4, v20
+; GFX11-NEXT:    v_min_f32_e32 v21, v70, v69
+; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17
+; GFX11-NEXT:    v_min_f32_e32 v17, v86, v85
+; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_min_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_min_f32_e32 v22, v68, v67
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_min_f32_e32 v27, v50, v49
+; GFX11-NEXT:    v_bfe_u32 v50, v10, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_min_f32 v2, v2, v18 :: v_dual_min_f32 v3, v3, v19
-; GFX11-NEXT:    v_dual_min_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX11-NEXT:    v_min_f32_e32 v26, v52, v51
-; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_dual_min_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_min_f32_e32 v20, v80, v71
+; GFX11-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_min_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_dual_min_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_dual_min_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT:    v_min_f32_e32 v9, v9, v25
 ; GFX11-NEXT:    v_min_f32_e32 v25, v54, v53
-; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_min_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX11-NEXT:    v_min_f32_e32 v23, v66, v65
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_min_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_min_f32_e32 v29, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_min_f32_e32 v37, v86, v85
-; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_min_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
-; GFX11-NEXT:    v_dual_min_f32 v34, v80, v71 :: v_dual_min_f32 v35, v82, v81
-; GFX11-NEXT:    v_min_f32_e32 v36, v84, v83
-; GFX11-NEXT:    v_dual_min_f32 v16, v32, v16 :: v_dual_min_f32 v15, v15, v17
-; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT:    v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT:    v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT:    v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT:    v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT:    v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT:    v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT:    v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT:    v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT:    v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT:    v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT:    v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT:    v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT:    v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT:    v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT:    v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT:    v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT:    v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT:    v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT:    v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT:    v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT:    v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT:    v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT:    v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT:    v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT:    v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT:    v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT:    v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT:    v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT:    v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT:    v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT:    v_min_f32_e32 v31, v31, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v15, v15, v32
+; GFX11-NEXT:    v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT:    v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
   ret <32 x bfloat> %op
@@ -15198,7 +20560,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_bf16:
@@ -15207,7 +20575,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_bf16:
@@ -15216,7 +20590,11 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_bf16:
@@ -15226,7 +20604,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
   ret bfloat %op
@@ -15272,12 +20656,25 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v2bf16:
@@ -15285,11 +20682,23 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -15302,7 +20711,17 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_max_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v2bf16:
@@ -15315,8 +20734,22 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %op
@@ -15375,15 +20808,34 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v3bf16:
@@ -15392,32 +20844,98 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
@@ -15486,20 +21004,46 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v4bf16:
@@ -15507,17 +21051,39 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -15528,17 +21094,37 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_max_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v4bf16:
@@ -15550,15 +21136,40 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_max_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -15676,36 +21287,86 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_max_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_max_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v8, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v8bf16:
@@ -15713,29 +21374,71 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v7, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_max_f32_e32 v7, v9, v7
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_max_f32_e32 v6, v9, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v6, v9, v6
+; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v5, v9, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v5, v9, v5
+; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
@@ -15754,58 +21457,151 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_max_f32_e32 v9, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_max_f32_e32 v10, v11, v10
-; GFX10-NEXT:    v_max_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_max_f32_e32 v11, v12, v11
+; GFX10-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v3, v3, v7
+; GFX11-NEXT:    v_max_f32_e32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_dual_max_f32 v10, v10, v12 :: v_dual_max_f32 v1, v1, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v11, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v11, v12, v11
+; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_dual_max_f32 v10, v11, v10 :: v_dual_max_f32 v11, v13, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -16023,68 +21819,166 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
 ; GFX8-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v15, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_max_f32_e32 v15, v18, v15
+; GFX8-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v15
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v17, v14, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_max_f32_e32 v14, v18, v14
+; GFX8-NEXT:    v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v14
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_max_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v13, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT:    v_max_f32_e32 v13, v18, v13
+; GFX8-NEXT:    v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v13
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_max_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v17, v12, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT:    v_max_f32_e32 v12, v18, v12
+; GFX8-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v12
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_max_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT:    v_max_f32_e32 v11, v18, v11
+; GFX8-NEXT:    v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v11
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_max_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v17, v10, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v10, v18, v10
+; GFX8-NEXT:    v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v10
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_max_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v9, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT:    v_max_f32_e32 v9, v18, v9
+; GFX8-NEXT:    v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v9
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_max_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v16, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v16bf16:
@@ -16092,53 +21986,135 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v18, v15, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_max_f32_e32 v15, v17, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v14, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_max_f32_e32 v14, v17, v14
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v18, v13, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_max_f32_e32 v13, v17, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v18, v12, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_max_f32_e32 v12, v17, v12
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v18, v11, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_max_f32_e32 v11, v17, v11
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v10, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_max_f32_e32 v10, v17, v10
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v9, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v9, v17, v9
+; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
@@ -16156,119 +22132,294 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
 ; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT:    v_max_f32_e32 v15, v18, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT:    v_max_f32_e32 v15, v17, v15
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_max_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_max_f32_e32 v14, v20, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX10-NEXT:    v_max_f32_e32 v5, v5, v13
-; GFX10-NEXT:    v_max_f32_e32 v13, v21, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_max_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_max_f32_e32 v17, v20, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_max_f32_e32 v18, v20, v19
+; GFX10-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT:    v_max_f32_e32 v12, v19, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_max_f32_e32 v11, v19, v18
+; GFX10-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT:    v_max_f32_e32 v10, v21, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_max_f32_e32 v19, v20, v19
-; GFX10-NEXT:    v_max_f32_e32 v20, v22, v21
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT:    v_max_f32_e32 v17, v19, v17
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_max_f32_e32 v5, v5, v13
-; GFX11-NEXT:    v_max_f32_e32 v13, v21, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_max_f32_e32 v7, v7, v15
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT:    v_dual_max_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v14, v20, v18 :: v_dual_max_f32 v5, v5, v13
+; GFX11-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX11-NEXT:    v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT:    v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v18
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT:    v_max_f32_e32 v14, v19, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT:    v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT:    v_max_f32_e32 v17, v20, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_max_f32 v4, v4, v12
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v12, v19, v18
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
 ; GFX11-NEXT:    v_max_f32_e32 v3, v3, v11
-; GFX11-NEXT:    v_dual_max_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v11, v19, v18
+; GFX11-NEXT:    v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v19, v20, v19 :: v_dual_max_f32 v20, v22, v21
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v17, v19, v17 :: v_dual_max_f32 v2, v2, v10
+; GFX11-NEXT:    v_max_f32_e32 v10, v21, v20
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
   ret <16 x bfloat> %op
@@ -16804,247 +22955,716 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX8-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX8-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX8-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX8-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX8-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX8-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX8-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX8-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX8-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX8-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX8-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX8-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX8-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX8-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX8-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX8-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX8-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX8-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX8-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX8-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX8-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX8-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX8-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX8-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX8-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX8-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX8-NEXT:    v_writelane_b32 v35, s64, 32
+; GFX8-NEXT:    v_writelane_b32 v35, s65, 33
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_writelane_b32 v35, s66, 34
 ; GFX8-NEXT:    v_max_f32_e32 v31, v32, v31
 ; GFX8-NEXT:    v_max_f32_e32 v30, v14, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT:    v_writelane_b32 v35, s68, 36
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    s_movk_i32 s68, 0x7fff
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
 ; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_max_f32_e32 v14, v32, v14
-; GFX8-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_max_f32_e32 v31, v31, v30
+; GFX8-NEXT:    v_max_f32_e32 v29, v13, v29
+; GFX8-NEXT:    v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT:    v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT:    v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT:    v_max_f32_e32 v31, v31, v29
+; GFX8-NEXT:    v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT:    v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_max_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_max_f32_e32 v28, v12, v28
+; GFX8-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT:    v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT:    v_max_f32_e32 v31, v15, v31
+; GFX8-NEXT:    v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT:    v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT:    v_max_f32_e32 v28, v28, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v11
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_max_f32_e32 v28, v32, v28
-; GFX8-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_max_f32_e32 v34, v34, v31
+; GFX8-NEXT:    v_max_f32_e32 v27, v11, v27
+; GFX8-NEXT:    v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT:    v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT:    v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_max_f32_e32 v27, v32, v27
-; GFX8-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_max_f32_e32 v34, v34, v27
+; GFX8-NEXT:    v_max_f32_e32 v26, v10, v26
+; GFX8-NEXT:    v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT:    v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT:    v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_max_f32_e32 v26, v32, v26
-; GFX8-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_max_f32_e32 v34, v34, v26
+; GFX8-NEXT:    v_max_f32_e32 v25, v9, v25
+; GFX8-NEXT:    v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT:    v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT:    v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT:    v_max_f32_e32 v25, v32, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT:    v_max_f32_e32 v32, v32, v33
-; GFX8-NEXT:    v_max_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_max_f32_e32 v34, v34, v25
+; GFX8-NEXT:    v_max_f32_e32 v24, v8, v24
+; GFX8-NEXT:    v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT:    v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT:    v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v7
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_max_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_max_f32_e32 v34, v34, v24
+; GFX8-NEXT:    v_max_f32_e32 v23, v7, v23
+; GFX8-NEXT:    v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT:    v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT:    v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_max_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_max_f32_e32 v34, v34, v23
 ; GFX8-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT:    v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT:    v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT:    v_max_f32_e32 v6, v34, v6
+; GFX8-NEXT:    v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT:    v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_max_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v6
+; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT:    v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT:    v_max_f32_e32 v5, v21, v5
+; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT:    v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_max_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_max_f32_e32 v4, v4, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT:    v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT:    v_max_f32_e32 v4, v20, v4
+; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT:    v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_max_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT:    v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT:    v_max_f32_e32 v3, v19, v3
+; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT:    v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_max_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT:    v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v2, v18, v2
+; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT:    v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_max_f32_e32 v18, v33, v18
-; GFX8-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT:    v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_max_f32_e32 v1, v17, v1
+; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT:    v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_max_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT:    v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], v0, v28
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_u32_e64 v0, s[66:67], s68, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT:    v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT:    v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT:    v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT:    v_readlane_b32 s68, v35, 36
+; GFX8-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX8-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX8-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX8-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX8-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX8-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX8-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX8-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX8-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX8-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX8-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX8-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX8-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX8-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX8-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX8-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX8-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX8-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX8-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX8-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX8-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX8-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX8-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX8-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX8-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX8-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX8-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX8-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX8-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX8-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX8-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX8-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX8-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX8-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT:    v_writelane_b32 v35, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v35, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v35, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v35, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v35, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v35, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v35, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v35, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v35, s40, 8
+; GFX9-NEXT:    v_writelane_b32 v35, s41, 9
+; GFX9-NEXT:    v_writelane_b32 v35, s42, 10
+; GFX9-NEXT:    v_writelane_b32 v35, s43, 11
+; GFX9-NEXT:    v_writelane_b32 v35, s44, 12
+; GFX9-NEXT:    v_writelane_b32 v35, s45, 13
+; GFX9-NEXT:    v_writelane_b32 v35, s46, 14
+; GFX9-NEXT:    v_writelane_b32 v35, s47, 15
+; GFX9-NEXT:    v_writelane_b32 v35, s48, 16
+; GFX9-NEXT:    v_writelane_b32 v35, s49, 17
+; GFX9-NEXT:    v_writelane_b32 v35, s50, 18
+; GFX9-NEXT:    v_writelane_b32 v35, s51, 19
+; GFX9-NEXT:    v_writelane_b32 v35, s52, 20
+; GFX9-NEXT:    v_writelane_b32 v35, s53, 21
+; GFX9-NEXT:    v_writelane_b32 v35, s54, 22
+; GFX9-NEXT:    v_writelane_b32 v35, s55, 23
+; GFX9-NEXT:    v_writelane_b32 v35, s56, 24
+; GFX9-NEXT:    v_writelane_b32 v35, s57, 25
+; GFX9-NEXT:    v_writelane_b32 v35, s58, 26
+; GFX9-NEXT:    v_writelane_b32 v35, s59, 27
+; GFX9-NEXT:    v_writelane_b32 v35, s60, 28
+; GFX9-NEXT:    v_writelane_b32 v35, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v35, s62, 30
+; GFX9-NEXT:    v_writelane_b32 v35, s63, 31
+; GFX9-NEXT:    v_writelane_b32 v35, s64, 32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
 ; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s65, 33
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v31
+; GFX9-NEXT:    v_max_f32_e32 v30, v14, v30
+; GFX9-NEXT:    v_writelane_b32 v35, s66, 34
+; GFX9-NEXT:    s_movk_i32 s66, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT:    v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], v30, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_max_f32_e32 v30, v32, v30
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v30
+; GFX9-NEXT:    v_max_f32_e32 v29, v13, v29
+; GFX9-NEXT:    v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT:    v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT:    v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[8:9], v29, v29
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_max_f32_e32 v29, v32, v29
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v29
+; GFX9-NEXT:    v_max_f32_e32 v28, v12, v28
+; GFX9-NEXT:    v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT:    v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT:    v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[12:13], v28, v28
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v28
+; GFX9-NEXT:    v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_max_f32_e32 v28, v32, v28
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v33
+; GFX9-NEXT:    v_max_f32_e32 v27, v11, v27
+; GFX9-NEXT:    v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT:    v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT:    v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[18:19], v27, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_max_f32_e32 v27, v32, v27
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v27
+; GFX9-NEXT:    v_max_f32_e32 v26, v10, v26
+; GFX9-NEXT:    v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT:    v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT:    v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[22:23], v26, v26
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
 ; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_max_f32_e32 v26, v32, v26
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v26
+; GFX9-NEXT:    v_max_f32_e32 v25, v9, v25
+; GFX9-NEXT:    v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT:    v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT:    v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[26:27], v25, v25
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT:    v_max_f32_e32 v25, v32, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT:    v_max_f32_e32 v32, v32, v33
-; GFX9-NEXT:    v_max_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v25
+; GFX9-NEXT:    v_max_f32_e32 v24, v8, v24
+; GFX9-NEXT:    v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT:    v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT:    v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[30:31], v24, v24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_max_f32_e32 v24, v33, v24
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v24
+; GFX9-NEXT:    v_max_f32_e32 v23, v7, v23
+; GFX9-NEXT:    v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT:    v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT:    v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[36:37], v23, v23
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_max_f32_e32 v23, v33, v23
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v23
+; GFX9-NEXT:    v_max_f32_e32 v22, v6, v22
+; GFX9-NEXT:    v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT:    v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT:    v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[40:41], v22, v22
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v22, v33, v22
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v22
+; GFX9-NEXT:    v_max_f32_e32 v21, v5, v21
+; GFX9-NEXT:    v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT:    v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT:    v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[44:45], v21, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v21, v33, v21
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v21
+; GFX9-NEXT:    v_max_f32_e32 v20, v4, v20
+; GFX9-NEXT:    v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT:    v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT:    v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[48:49], v20, v20
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v20, v33, v20
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v20
+; GFX9-NEXT:    v_max_f32_e32 v19, v3, v19
+; GFX9-NEXT:    v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT:    v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT:    v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[52:53], v19, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v19, v33, v19
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v19
+; GFX9-NEXT:    v_max_f32_e32 v18, v2, v18
+; GFX9-NEXT:    v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT:    v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT:    v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[56:57], v18, v18
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v18, v33, v18
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v18
+; GFX9-NEXT:    v_max_f32_e32 v17, v1, v17
+; GFX9-NEXT:    v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT:    v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT:    v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[60:61], v17, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v17, v33, v17
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_max_f32_e32 v16, v0, v16
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v34
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v17
+; GFX9-NEXT:    v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT:    v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT:    v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT:    v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT:    v_writelane_b32 v35, s67, 35
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
@@ -17053,13 +23673,72 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
 ; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
 ; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v32bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT:    v_readlane_b32 s67, v35, 35
+; GFX9-NEXT:    v_readlane_b32 s66, v35, 34
+; GFX9-NEXT:    v_readlane_b32 s65, v35, 33
+; GFX9-NEXT:    v_readlane_b32 s64, v35, 32
+; GFX9-NEXT:    v_readlane_b32 s63, v35, 31
+; GFX9-NEXT:    v_readlane_b32 s62, v35, 30
+; GFX9-NEXT:    v_readlane_b32 s61, v35, 29
+; GFX9-NEXT:    v_readlane_b32 s60, v35, 28
+; GFX9-NEXT:    v_readlane_b32 s59, v35, 27
+; GFX9-NEXT:    v_readlane_b32 s58, v35, 26
+; GFX9-NEXT:    v_readlane_b32 s57, v35, 25
+; GFX9-NEXT:    v_readlane_b32 s56, v35, 24
+; GFX9-NEXT:    v_readlane_b32 s55, v35, 23
+; GFX9-NEXT:    v_readlane_b32 s54, v35, 22
+; GFX9-NEXT:    v_readlane_b32 s53, v35, 21
+; GFX9-NEXT:    v_readlane_b32 s52, v35, 20
+; GFX9-NEXT:    v_readlane_b32 s51, v35, 19
+; GFX9-NEXT:    v_readlane_b32 s50, v35, 18
+; GFX9-NEXT:    v_readlane_b32 s49, v35, 17
+; GFX9-NEXT:    v_readlane_b32 s48, v35, 16
+; GFX9-NEXT:    v_readlane_b32 s47, v35, 15
+; GFX9-NEXT:    v_readlane_b32 s46, v35, 14
+; GFX9-NEXT:    v_readlane_b32 s45, v35, 13
+; GFX9-NEXT:    v_readlane_b32 s44, v35, 12
+; GFX9-NEXT:    v_readlane_b32 s43, v35, 11
+; GFX9-NEXT:    v_readlane_b32 s42, v35, 10
+; GFX9-NEXT:    v_readlane_b32 s41, v35, 9
+; GFX9-NEXT:    v_readlane_b32 s40, v35, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v35, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v35, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v35, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v35, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v35, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v35, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v35, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v35, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
@@ -17072,14 +23751,18 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_max_f32_e32 v39, v48, v39
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
 ; GFX10-NEXT:    v_max_f32_e32 v11, v11, v27
@@ -17092,7 +23775,73 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_max_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_max_f32_e32 v19, v28, v38
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_max_f32_e32 v18, v27, v48
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_max_f32_e32 v17, v26, v50
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_max_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_max_f32_e32 v20, v29, v36
+; GFX10-NEXT:    v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT:    v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT:    v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT:    v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT:    v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT:    v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT:    v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT:    v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT:    v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT:    v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT:    v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
@@ -17110,30 +23859,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
 ; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_max_f32_e32 v33, v34, v33
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_max_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_max_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT:    v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
 ; GFX10-NEXT:    v_max_f32_e32 v51, v52, v51
 ; GFX10-NEXT:    v_max_f32_e32 v9, v9, v25
 ; GFX10-NEXT:    v_max_f32_e32 v25, v54, v53
@@ -17143,142 +23876,423 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_max_f32_e32 v23, v66, v65
 ; GFX10-NEXT:    v_max_f32_e32 v6, v6, v22
 ; GFX10-NEXT:    v_max_f32_e32 v22, v68, v67
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_max_f32_e32 v21, v30, v34
-; GFX10-NEXT:    v_max_f32_e32 v29, v29, v36
-; GFX10-NEXT:    v_max_f32_e32 v28, v28, v38
-; GFX10-NEXT:    v_max_f32_e32 v27, v27, v48
-; GFX10-NEXT:    v_max_f32_e32 v26, v26, v50
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT:    v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT:    v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT:    v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT:    v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT:    v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT:    v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT:    v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT:    v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT:    v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT:    v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT:    v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT:    v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT:    v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT:    v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT:    v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT:    v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT:    v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT:    v_max_f32_e32 v17, v31, v33
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v32
+; GFX10-NEXT:    v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT:    v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT:    v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT:    v_max_f32_e32 v16, v32, v16
-; GFX10-NEXT:    v_max_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
 ; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
 ; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_max_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v4, v4, v20
+; GFX11-NEXT:    v_max_f32_e32 v21, v70, v69
+; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17
+; GFX11-NEXT:    v_max_f32_e32 v17, v86, v85
+; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_max_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT:    v_max_f32_e32 v22, v68, v67
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT:    v_max_f32_e32 v27, v50, v49
+; GFX11-NEXT:    v_bfe_u32 v50, v10, 16, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT:    v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v18 :: v_dual_max_f32 v3, v3, v19
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX11-NEXT:    v_max_f32_e32 v26, v52, v51
-; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_max_f32_e32 v20, v80, v71
+; GFX11-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_max_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_dual_max_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_dual_max_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT:    v_max_f32_e32 v9, v9, v25
 ; GFX11-NEXT:    v_max_f32_e32 v25, v54, v53
-; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_max_f32_e32 v24, v64, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX11-NEXT:    v_max_f32_e32 v23, v66, v65
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_max_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_max_f32_e32 v29, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_max_f32_e32 v37, v86, v85
-; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_max_f32_e32 v28, v48, v39
 ; GFX11-NEXT:    v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
-; GFX11-NEXT:    v_dual_max_f32 v34, v80, v71 :: v_dual_max_f32 v35, v82, v81
-; GFX11-NEXT:    v_max_f32_e32 v36, v84, v83
-; GFX11-NEXT:    v_dual_max_f32 v16, v32, v16 :: v_dual_max_f32 v15, v15, v17
-; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT:    v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT:    v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT:    v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT:    v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT:    v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT:    v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT:    v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT:    v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT:    v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT:    v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT:    v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT:    v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT:    v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT:    v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT:    v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT:    v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT:    v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT:    v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT:    v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT:    v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT:    v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT:    v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT:    v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT:    v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT:    v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT:    v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT:    v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT:    v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT:    v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT:    v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT:    v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT:    v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT:    v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT:    v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT:    v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT:    v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT:    v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT:    v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT:    v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT:    v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT:    v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT:    v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT:    v_max_f32_e32 v31, v31, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v15, v15, v32
+; GFX11-NEXT:    v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT:    v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
   ret <32 x bfloat> %op
@@ -17359,7 +24373,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sqrt_bf16:
@@ -17384,7 +24404,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sqrt_bf16:
@@ -17407,7 +24433,11 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sqrt_bf16:
@@ -17437,8 +24467,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
   ret bfloat %op
@@ -17468,7 +24503,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ldexp_bf16_i32:
@@ -17476,7 +24517,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ldexp_bf16_i32:
@@ -17484,7 +24531,11 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ldexp_bf16_i32:
@@ -17493,7 +24544,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
   ret bfloat %op
@@ -17527,10 +24584,16 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX8-LABEL: v_frexp_bf16_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
-; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v1
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_frexp_bf16_i16:
@@ -17538,7 +24601,13 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -17548,7 +24617,11 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
   ret { bfloat, i16 } %op
@@ -17638,7 +24711,13 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_log_bf16:
@@ -17663,7 +24742,13 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_log_bf16:
@@ -17682,7 +24767,11 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_log_bf16:
@@ -17706,8 +24795,13 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.log.bf16(bfloat %a)
   ret bfloat %op
@@ -17758,8 +24852,14 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_log2_bf16:
@@ -17774,8 +24874,14 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_log2_bf16:
@@ -17788,7 +24894,11 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_log2_bf16:
@@ -17804,7 +24914,13 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.log2.bf16(bfloat %a)
   ret bfloat %op
@@ -17889,7 +25005,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_log10_bf16:
@@ -17914,7 +25036,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_log10_bf16:
@@ -17933,7 +25061,11 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_log10_bf16:
@@ -17957,8 +25089,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.log10.bf16(bfloat %a)
   ret bfloat %op
@@ -18045,7 +25182,13 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_exp_bf16:
@@ -18070,7 +25213,13 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_exp_bf16:
@@ -18090,7 +25239,11 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_exp_bf16:
@@ -18115,8 +25268,13 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.exp.bf16(bfloat %a)
   ret bfloat %op
@@ -18167,8 +25325,14 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_exp2_bf16:
@@ -18183,8 +25347,14 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_exp2_bf16:
@@ -18197,7 +25367,11 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_exp2_bf16:
@@ -18213,7 +25387,13 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.exp2.bf16(bfloat %a)
   ret bfloat %op
@@ -18296,7 +25476,13 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_exp10_bf16:
@@ -18321,7 +25507,13 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_exp10_bf16:
@@ -18341,7 +25533,11 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_exp10_bf16:
@@ -18366,8 +25562,13 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.exp10.bf16(bfloat %a)
   ret bfloat %op
@@ -18397,7 +25598,13 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ceil_bf16:
@@ -18405,7 +25612,13 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ceil_bf16:
@@ -18413,7 +25626,11 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ceil_bf16:
@@ -18422,7 +25639,13 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.ceil.bf16(bfloat %a)
   ret bfloat %op
@@ -18452,7 +25675,13 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_trunc_bf16:
@@ -18460,7 +25689,13 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_trunc_bf16:
@@ -18468,7 +25703,11 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_trunc_bf16:
@@ -18477,7 +25716,13 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.trunc.bf16(bfloat %a)
   ret bfloat %op
@@ -18507,7 +25752,13 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rint_bf16:
@@ -18515,7 +25766,13 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rint_bf16:
@@ -18523,7 +25780,11 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rint_bf16:
@@ -18532,7 +25793,13 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.rint.bf16(bfloat %a)
   ret bfloat %op
@@ -18562,7 +25829,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_nearbyint_bf16:
@@ -18570,7 +25843,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_nearbyint_bf16:
@@ -18578,7 +25857,11 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_nearbyint_bf16:
@@ -18587,7 +25870,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
   ret bfloat %op
@@ -18635,7 +25924,13 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_round_bf16:
@@ -18649,7 +25944,13 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_brev_b32 s4, -2
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_round_bf16:
@@ -18662,7 +25963,11 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_round_bf16:
@@ -18678,8 +25983,13 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.round.bf16(bfloat %a)
   ret bfloat %op
@@ -18709,7 +26019,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_bf16:
@@ -18717,7 +26033,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_bf16:
@@ -18725,7 +26047,11 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_roundeven_bf16:
@@ -18734,7 +26060,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
   ret bfloat %op
@@ -18764,7 +26096,13 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_floor_f32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_floor_bf16:
@@ -18772,7 +26110,13 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_floor_f32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_floor_bf16:
@@ -18780,7 +26124,11 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_floor_f32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_floor_bf16:
@@ -18789,7 +26137,13 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_floor_f32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.floor.bf16(bfloat %a)
   ret bfloat %op
@@ -18813,7 +26167,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_canonicalize_bf16:
@@ -18821,7 +26181,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_canonicalize_bf16:
@@ -18829,7 +26195,11 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_canonicalize_bf16:
@@ -18838,7 +26208,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
   ret bfloat %op
@@ -22198,21 +29574,37 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i16_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i16_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i16_to_bf16:
@@ -22221,7 +29613,13 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp i16 %x to bfloat
   ret bfloat %op
@@ -22255,37 +29653,86 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; GFX11-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <2 x i16> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -22325,32 +29772,116 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; GFX11-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -22394,54 +29925,147 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc0
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v4, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v0, v3, v0, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v5, s4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_add3_u32 v5, v5, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX11-NEXT:    v_bfe_i32 v3, v0, 0, 16
+; GFX11-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i16> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -22466,29 +30090,50 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i32_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i32_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i32_to_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp i32 %x to bfloat
   ret bfloat %op
@@ -22516,10 +30161,23 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
@@ -22527,7 +30185,19 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -22536,7 +30206,17 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16:
@@ -22544,8 +30224,22 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <2 x i32> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -22578,22 +30272,58 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v1
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v2, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -22602,9 +30332,53 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i32> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -22639,24 +30413,72 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -22668,8 +30490,28 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
@@ -22679,9 +30521,34 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i32> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -22738,8 +30605,14 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i64_to_bf16:
@@ -22752,12 +30625,18 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
 ; GFX9-NEXT:    v_min_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i64_to_bf16:
@@ -22775,7 +30654,11 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i64_to_bf16:
@@ -22798,7 +30681,13 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp i64 %x to bfloat
   ret bfloat %op
@@ -22877,25 +30766,39 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
 ; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v4
+; GFX8-NEXT:    v_ldexp_f32 v4, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v0, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v5, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
 ; GFX8-NEXT:    v_min_u32_e32 v6, v0, v1
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_ldexp_f32 v1, v5, v2
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
@@ -22908,24 +30811,36 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
 ; GFX9-NEXT:    v_min_u32_e32 v4, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v4, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v0, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v0, v0, v4, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
 ; GFX9-NEXT:    v_min_u32_e32 v6, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v5
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v6
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -22956,7 +30871,17 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16:
@@ -22995,8 +30920,22 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <2 x i64> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -23101,38 +31040,57 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
 ; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT:    v_xor_b32_e32 v7, v0, v1
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v6, v4, v5
-; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX8-NEXT:    v_ffbh_i32_e32 v4, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT:    v_min_u32_e32 v7, v4, v5
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v7, v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT:    v_xor_b32_e32 v5, v2, v3
-; GFX8-NEXT:    v_ffbh_i32_e32 v4, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT:    v_ffbh_i32_e32 v6, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7fc0
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, v2, v3
+; GFX8-NEXT:    v_ffbh_i32_e32 v5, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, -1, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 32, v6
+; GFX8-NEXT:    v_min_u32_e32 v5, v5, v6
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v4
 ; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
 ; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -23148,79 +31106,182 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX9-NEXT:    v_xor_b32_e32 v7, v0, v1
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
 ; GFX9-NEXT:    v_ffbh_i32_e32 v6, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
 ; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
 ; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT:    v_min_u32_e32 v7, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v5
+; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc0
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_sub_u32_e32 v4, 32, v6
+; GFX9-NEXT:    v_xor_b32_e32 v6, v2, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT:    v_ffbh_i32_e32 v5, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
+; GFX9-NEXT:    v_add_u32_e32 v5, -1, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, 32, v6
+; GFX9-NEXT:    v_min_u32_e32 v5, v5, v6
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v4, 16
+; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v5
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v7, v0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GFX10-NEXT:    v_xor_b32_e32 v9, v2, v3
-; GFX10-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX10-NEXT:    v_xor_b32_e32 v6, v0, v1
+; GFX10-NEXT:    v_xor_b32_e32 v8, v2, v3
 ; GFX10-NEXT:    v_ffbh_i32_e32 v10, v1
-; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX10-NEXT:    v_xor_b32_e32 v9, v4, v5
 ; GFX10-NEXT:    v_ffbh_i32_e32 v11, v3
-; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, 32, v7
+; GFX10-NEXT:    v_ffbh_i32_e32 v7, v5
+; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 32, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, -1, v11
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
-; GFX10-NEXT:    v_min_u32_e32 v7, v10, v7
-; GFX10-NEXT:    v_min_u32_e32 v9, v11, v9
-; GFX10-NEXT:    v_min_u32_e32 v6, v6, v8
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX10-NEXT:    v_min_u32_e32 v6, v10, v6
+; GFX10-NEXT:    v_min_u32_e32 v8, v11, v8
+; GFX10-NEXT:    v_min_u32_e32 v7, v7, v9
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v4
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX10-NEXT:    v_add3_u32 v3, v4, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v5, v1, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_xor_b32_e32 v8, v2, v3
+; GFX11-NEXT:    v_cls_i32_e32 v10, v1
+; GFX11-NEXT:    v_cls_i32_e32 v11, v3
+; GFX11-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
+; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX11-NEXT:    v_cls_i32_e32 v7, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; GFX11-NEXT:    v_min_u32_e32 v6, v10, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v8, v11, v8
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT:    v_min_u32_e32 v7, v7, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v4
+; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX11-NEXT:    v_add3_u32 v3, v4, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v4, v5, v1, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i64> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -23350,51 +31411,77 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
 ; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v8, v4, v5
+; GFX8-NEXT:    v_bfe_u32 v4, v8, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v5, v6, v7
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v9, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX8-NEXT:    v_ffbh_i32_e32 v4, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
 ; GFX8-NEXT:    v_min_u32_e32 v10, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v9, v0, v1
+; GFX8-NEXT:    v_ffbh_i32_e32 v8, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX8-NEXT:    v_ldexp_f32 v5, v9, v6
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
-; GFX8-NEXT:    v_xor_b32_e32 v7, v0, v1
-; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
-; GFX8-NEXT:    v_ffbh_i32_e32 v6, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
-; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
+; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 32, v10
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v7
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v4
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v7, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v0, v7, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v7
 ; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v7, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v0
 ; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT:    v_min_u32_e32 v8, v0, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT:    v_min_u32_e32 v9, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_ldexp_f32 v1, v7, v2
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v9
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v4, v5, 16
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -23407,50 +31494,72 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
 ; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v8
+; GFX9-NEXT:    v_ldexp_f32 v8, v4, v5
+; GFX9-NEXT:    v_bfe_u32 v4, v8, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v8, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v7
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX9-NEXT:    v_ffbh_i32_e32 v4, v7
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
 ; GFX9-NEXT:    v_min_u32_e32 v10, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
-; GFX9-NEXT:    v_xor_b32_e32 v8, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_ffbh_i32_e32 v7, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_add_u32_e32 v7, -1, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v9, v0, v1
+; GFX9-NEXT:    v_ffbh_i32_e32 v8, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX9-NEXT:    v_add_u32_e32 v8, -1, v8
+; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
+; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v7, 32, v10
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v7, v7, v4, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT:    v_ldexp_f32 v7, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v0, v7, 16, 1
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_add3_u32 v8, v0, v7, s4
 ; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT:    v_min_u32_e32 v8, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v5
+; GFX9-NEXT:    v_min_u32_e32 v9, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v9
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_perm_b32 v1, v4, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v4, v5, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -23458,18 +31567,18 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
 ; GFX10-NEXT:    v_ffbh_i32_e32 v9, v5
-; GFX10-NEXT:    v_xor_b32_e32 v11, v6, v7
 ; GFX10-NEXT:    v_xor_b32_e32 v13, v0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v14, v2, v3
+; GFX10-NEXT:    v_xor_b32_e32 v11, v6, v7
+; GFX10-NEXT:    v_ffbh_i32_e32 v12, v1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX10-NEXT:    v_xor_b32_e32 v14, v2, v3
 ; GFX10-NEXT:    v_ffbh_i32_e32 v10, v7
-; GFX10-NEXT:    v_ffbh_i32_e32 v12, v1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v12, -1, v12
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
-; GFX10-NEXT:    v_add_nc_u32_e32 v12, -1, v12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 32, v11
 ; GFX10-NEXT:    v_min_u32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
@@ -23483,29 +31592,49 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX10-NEXT:    v_min_u32_e32 v9, v12, v9
 ; GFX10-NEXT:    v_min_u32_e32 v11, v13, v14
 ; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
 ; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX10-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
+; GFX10-NEXT:    v_min_u32_e32 v5, 1, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v10
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v5
+; GFX10-NEXT:    v_ldexp_f32 v3, v4, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
+; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT:    v_ldexp_f32 v3, v4, v7
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v5
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
-; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v11
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX10-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v5
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v3, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -23513,18 +31642,18 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v8, v4, v5
 ; GFX11-NEXT:    v_cls_i32_e32 v9, v5
-; GFX11-NEXT:    v_xor_b32_e32 v11, v6, v7
 ; GFX11-NEXT:    v_xor_b32_e32 v13, v0, v1
-; GFX11-NEXT:    v_xor_b32_e32 v14, v2, v3
+; GFX11-NEXT:    v_xor_b32_e32 v11, v6, v7
+; GFX11-NEXT:    v_cls_i32_e32 v12, v1
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX11-NEXT:    v_xor_b32_e32 v14, v2, v3
 ; GFX11-NEXT:    v_cls_i32_e32 v10, v7
-; GFX11-NEXT:    v_cls_i32_e32 v12, v1
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
 ; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
 ; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
-; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
 ; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
 ; GFX11-NEXT:    v_min_u32_e32 v8, v9, v8
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
@@ -23537,37 +31666,61 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
 ; GFX11-NEXT:    v_min_u32_e32 v9, v12, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_u32_e32 v11, v13, v14
 ; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
+; GFX11-NEXT:    v_min_u32_e32 v5, 1, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 32, v10
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v4
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v3, v4, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
+; GFX11-NEXT:    v_or_b32_e32 v1, v7, v5
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX11-NEXT:    v_ldexp_f32 v3, v4, v7
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v11
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v5
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v6, v2, 16, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i64> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -23594,21 +31747,37 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i16_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i16_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i16_to_bf16:
@@ -23617,7 +31786,13 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp i16 %x to bfloat
   ret bfloat %op
@@ -23649,10 +31824,23 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -23660,7 +31848,19 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -23669,7 +31869,17 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -23680,8 +31890,22 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <2 x i16> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -23719,23 +31943,58 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -23744,9 +32003,58 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -23789,24 +32097,72 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v0, v3, v0, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v5, s4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_add3_u32 v5, v5, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -23818,8 +32174,28 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -23835,9 +32211,34 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i16> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -23862,29 +32263,50 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i32_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i32_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i32_to_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp i32 %x to bfloat
   ret bfloat %op
@@ -23912,10 +32334,23 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -23923,7 +32358,19 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -23932,7 +32379,17 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -23940,8 +32397,22 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <2 x i32> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -23974,22 +32445,58 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v2, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -23998,9 +32505,53 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i32> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -24035,24 +32586,72 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -24064,8 +32663,28 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
@@ -24075,9 +32694,34 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i32> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -24122,8 +32766,14 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i64_to_bf16:
@@ -24132,12 +32782,18 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX9-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX9-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v2
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i64_to_bf16:
@@ -24151,7 +32807,11 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i64_to_bf16:
@@ -24168,8 +32828,13 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp i64 %x to bfloat
   ret bfloat %op
@@ -24230,19 +32895,32 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v4
+; GFX8-NEXT:    v_ldexp_f32 v4, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v0, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
 ; GFX8-NEXT:    v_min_u32_e32 v6, 32, v0
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_ldexp_f32 v1, v5, v2
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
@@ -24251,20 +32929,32 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX9-NEXT:    v_ffbh_u32_e32 v4, v1
 ; GFX9-NEXT:    v_min_u32_e32 v4, 32, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v4, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v0, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
 ; GFX9-NEXT:    v_min_u32_e32 v6, 32, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v5
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v6
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -24287,7 +32977,17 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16:
@@ -24315,8 +33015,22 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <2 x i64> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -24393,30 +33107,49 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-NEXT:    v_ffbh_u32_e32 v6, v5
 ; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v6, v4, v5
-; GFX8-NEXT:    v_ffbh_u32_e32 v4, v1
-; GFX8-NEXT:    v_min_u32_e32 v7, 32, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v7, v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v4
-; GFX8-NEXT:    v_ffbh_u32_e32 v4, v3
-; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_ffbh_u32_e32 v6, v1
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7fc0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX8-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v4
 ; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
 ; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -24425,30 +33158,46 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX9-NEXT:    v_ffbh_u32_e32 v6, v5
 ; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
 ; GFX9-NEXT:    v_ffbh_u32_e32 v6, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
-; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT:    v_min_u32_e32 v7, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v5
+; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc0
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX9-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v4, 16
+; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    v_sub_u32_e32 v4, 32, v6
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v5
+; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -24463,24 +33212,96 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v4
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v8
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX10-NEXT:    v_add3_u32 v3, v4, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v5, v1, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v6, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v7, v3
+; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX11-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v3, v4, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v4, v5, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i64> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -24574,39 +33395,65 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-NEXT:    v_ffbh_u32_e32 v8, v5
 ; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v8, v4, v5
+; GFX8-NEXT:    v_bfe_u32 v4, v8, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX8-NEXT:    v_ffbh_u32_e32 v4, v7
 ; GFX8-NEXT:    v_min_u32_e32 v10, 32, v4
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v8
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
 ; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
-; GFX8-NEXT:    v_ldexp_f32 v5, v9, v6
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
-; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
-; GFX8-NEXT:    v_ffbh_u32_e32 v6, v1
-; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_ffbh_u32_e32 v8, v1
+; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 32, v10
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v7
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v7, v0, v1
+; GFX8-NEXT:    v_bfe_u32 v0, v7, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v7
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v0
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT:    v_min_u32_e32 v8, 32, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT:    v_min_u32_e32 v9, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_ldexp_f32 v1, v7, v2
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v9
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v4, v5, 16
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
@@ -24615,86 +33462,128 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX9-NEXT:    v_ffbh_u32_e32 v8, v5
 ; GFX9-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v8
+; GFX9-NEXT:    v_ldexp_f32 v8, v4, v5
+; GFX9-NEXT:    v_bfe_u32 v4, v8, 16, 1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v8, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX9-NEXT:    v_ffbh_u32_e32 v4, v7
 ; GFX9-NEXT:    v_min_u32_e32 v10, 32, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT:    v_ffbh_u32_e32 v7, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
 ; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_ffbh_u32_e32 v8, v1
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
-; GFX9-NEXT:    v_min_u32_e32 v7, 32, v7
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v7, 32, v10
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT:    v_add3_u32 v7, v7, v4, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT:    v_ldexp_f32 v7, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v0, v7, 16, 1
+; GFX9-NEXT:    v_add3_u32 v8, v0, v7, s4
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
-; GFX9-NEXT:    v_min_u32_e32 v8, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v5
+; GFX9-NEXT:    v_min_u32_e32 v9, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v9
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_perm_b32 v1, v4, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v4, v5, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v8, v5
-; GFX10-NEXT:    v_ffbh_u32_e32 v9, v1
-; GFX10-NEXT:    v_ffbh_u32_e32 v10, v3
-; GFX10-NEXT:    v_ffbh_u32_e32 v11, v7
+; GFX10-NEXT:    v_ffbh_u32_e32 v9, v7
+; GFX10-NEXT:    v_ffbh_u32_e32 v10, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v11, v3
 ; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v9, 32, v9
 ; GFX10-NEXT:    v_min_u32_e32 v10, 32, v10
 ; GFX10-NEXT:    v_min_u32_e32 v11, 32, v11
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
 ; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT:    v_or_b32_e32 v5, v7, v6
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, v7, v6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v11
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v9
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v5
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, v8
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX10-NEXT:    v_ldexp_f32 v4, v4, v8
+; GFX10-NEXT:    v_ldexp_f32 v1, v4, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX10-NEXT:    v_bfe_u32 v4, v3, 16, 1
 ; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
-; GFX10-NEXT:    v_ldexp_f32 v1, v3, v1
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v3, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
-; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v1
-; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v3
-; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
+; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v7
+; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
@@ -24703,36 +33592,61 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
 ; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v6
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v11
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v9
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v4
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v5
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, v8
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT:    v_ldexp_f32 v4, v4, v8
+; GFX11-NEXT:    v_ldexp_f32 v1, v4, v1
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v4, v3, 16, 1
 ; GFX11-NEXT:    v_ldexp_f32 v2, v2, v6
-; GFX11-NEXT:    v_ldexp_f32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i64> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -29207,7 +38121,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_bf16:
@@ -29217,7 +38137,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_bf16:
@@ -29227,7 +38153,11 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v0, v2, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_add3_u32 v0, v0, v2, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_bf16:
@@ -29238,7 +38168,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_bfe_u32 v0, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v0, v0, v2, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
   ret bfloat %op
@@ -29281,13 +38217,26 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_fma_f32 v3, v5, v4, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT:    v_fma_f32 v3, v5, v4, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2bf16:
@@ -29296,12 +38245,24 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_fma_f32 v3, v5, v4, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_fma_f32 v3, v5, v4, v3
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -29316,7 +38277,17 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v5, v4
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v0, v3, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT:    v_add3_u32 v0, v0, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2bf16:
@@ -29330,7 +38301,21 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
-; GFX11-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v0, v3, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v0, v0, v3, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   ret <2 x bfloat> %op
@@ -29384,17 +38369,36 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_fma_f32 v3, v7, v6, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    v_fma_f32 v3, v6, v5, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v3bf16:
@@ -29404,37 +38408,106 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT:    v_fma_f32 v3, v7, v6, v3
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_fma_f32 v3, v6, v5, v3
+; GFX9-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v6, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v0, v3
+; GFX10-NEXT:    v_add3_u32 v0, v1, v6, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v1, v2, v4, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v2, v5, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v5, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fma_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v0, v3
+; GFX11-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX11-NEXT:    v_bfe_u32 v1, v6, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v0, v1, v6, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v1, v2, v4, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v2, v5, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_add3_u32 v2, v2, v5, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
 }
@@ -29496,23 +38569,49 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v6
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
 ; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_fma_f32 v3, v7, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_fma_f32 v3, v7, v5, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v4bf16:
@@ -29521,20 +38620,42 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT:    v_fma_f32 v3, v7, v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_fma_f32 v3, v7, v5, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -29545,45 +38666,94 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v10
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v8
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
 ; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v4, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v5, v6, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v0, v7, 16, 1
+; GFX10-NEXT:    v_add3_u32 v1, v8, v6, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX10-NEXT:    v_add3_u32 v0, v0, v7, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v5, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_fmac_f32 v9, v11, v10 :: v_dual_lshlrev_b32 v6, 16, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_dual_fmac_f32 v4, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v9, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_fmac_f32_e32 v5, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX11-NEXT:    v_perm_b32 v1, v5, v6, 0x7060302
+; GFX11-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v5, 0x7fff
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v9, v8
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v0, v7, 16, 1
+; GFX11-NEXT:    v_add3_u32 v1, v8, v6, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT:    v_add3_u32 v0, v0, v7, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
@@ -29625,10 +38795,22 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_bf16:
@@ -29637,10 +38819,21 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_bf16:
@@ -29649,10 +38842,19 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_bf16:
@@ -29661,11 +38863,24 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0x7fc0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
   ret bfloat %op
@@ -29715,18 +38930,44 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v2bf16:
@@ -29734,17 +38975,39 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -29757,13 +39020,33 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v1, v3, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v1, v1, v3, 0x7fff
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_v2bf16:
@@ -29772,18 +39055,45 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v1, v3, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0x7fc0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   ret <2 x bfloat> %op
@@ -29848,24 +39158,61 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v3bf16:
@@ -29874,50 +39221,175 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_mul_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_bfe_u32 v2, v6, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v6, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v7, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_add3_u32 v3, v4, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v4, v5, v0, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmuladd_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v2, v6, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v3, v7, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0x7fc0, v3 :: v_dual_lshlrev_b32 v3, 16, v4
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v3, 16, v5
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, 0x7fc0, v6 :: v_dual_add_f32 v0, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_add3_u32 v3, v4, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v4, v5, v0, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
 }
@@ -29994,32 +39466,82 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX8-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX8-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v6
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v4bf16:
@@ -30027,29 +39549,71 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -30059,64 +39623,152 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v7, v9, v8
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_mul_f32_e32 v3, v8, v7
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_bfe_u32 v2, v7, 16, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_add3_u32 v3, v8, v6, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v7, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_add_f32_e32 v2, v6, v2
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_add3_u32 v5, v6, v3, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v2, v6, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX11-NEXT:    v_mul_f32_e32 v3, v8, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    v_mul_f32_e32 v7, v9, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v2, v7, 16, 1
+; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v3, v8, v6, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT:    v_add3_u32 v2, v2, v7, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, 0x7fc0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v10
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, 0x7fc0, v6 :: v_dual_lshlrev_b32 v6, 16, v4
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0x7fc0, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v5, v6, v3, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index a69fb35f8f0cb0..63fa5c5ac18020 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -787,7 +787,13 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
@@ -796,8 +802,14 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 490167ee3635a3..73655161ce86cf 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -1504,106 +1504,127 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
 define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_bf16_agent:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX900-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; GFX900-NEXT:    s_mov_b64 s[0:1], 0
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0x7fc0
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    s_and_b32 s2, s4, -4
-; GFX900-NEXT:    s_mov_b32 s3, s5
-; GFX900-NEXT:    s_load_dword s6, s[2:3], 0x0
-; GFX900-NEXT:    s_and_b32 s4, s4, 3
-; GFX900-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX900-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX900-NEXT:    s_not_b32 s5, s5
+; GFX900-NEXT:    s_and_b32 s2, s6, -4
+; GFX900-NEXT:    s_mov_b32 s3, s7
+; GFX900-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX900-NEXT:    s_and_b32 s5, s6, 3
+; GFX900-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX900-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX900-NEXT:    s_not_b32 s6, s6
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX900-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_mov_b32_e32 v2, v1
-; GFX900-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX900-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX900-NEXT:    v_and_or_b32 v1, v2, s5, v1
-; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX900-NEXT:    v_mov_b32_e32 v3, v2
+; GFX900-NEXT:    v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX900-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; GFX900-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX900-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, s5, v2
+; GFX900-NEXT:    v_and_or_b32 v2, v3, s6, v2
+; GFX900-NEXT:    global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX900-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX900-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX900-NEXT:    v_lshrrev_b32_e32 v0, s5, v2
 ; GFX900-NEXT:    global_store_short v[0:1], v0, off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_ret_bf16_agent:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX908-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; GFX908-NEXT:    s_mov_b64 s[0:1], 0
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0x7fc0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_and_b32 s2, s4, -4
-; GFX908-NEXT:    s_mov_b32 s3, s5
-; GFX908-NEXT:    s_load_dword s6, s[2:3], 0x0
-; GFX908-NEXT:    s_and_b32 s4, s4, 3
-; GFX908-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX908-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX908-NEXT:    s_not_b32 s5, s5
+; GFX908-NEXT:    s_and_b32 s2, s6, -4
+; GFX908-NEXT:    s_mov_b32 s3, s7
+; GFX908-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX908-NEXT:    s_and_b32 s5, s6, 3
+; GFX908-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX908-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX908-NEXT:    s_not_b32 s6, s6
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v2, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v1, v2, s5, v1
-; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; GFX908-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX908-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX908-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX908-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, s5, v2
+; GFX908-NEXT:    v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT:    global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s5, v2
 ; GFX908-NEXT:    global_store_short v[0:1], v0, off
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_bf16_agent:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x7fc0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s2, s4, -4
-; GFX90A-NEXT:    s_mov_b32 s3, s5
-; GFX90A-NEXT:    s_load_dword s6, s[2:3], 0x0
-; GFX90A-NEXT:    s_and_b32 s4, s4, 3
-; GFX90A-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX90A-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX90A-NEXT:    s_not_b32 s5, s5
+; GFX90A-NEXT:    s_and_b32 s2, s6, -4
+; GFX90A-NEXT:    s_mov_b32 s3, s7
+; GFX90A-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX90A-NEXT:    s_and_b32 s5, s6, 3
+; GFX90A-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX90A-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX90A-NEXT:    s_not_b32 s6, s6
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v2, v3, s5, v1
-; GFX90A-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; GFX90A-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX90A-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX90A-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s5, v2
+; GFX90A-NEXT:    v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s5, v2
 ; GFX90A-NEXT:    global_store_short v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
@@ -1627,7 +1648,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
 ; GFX10-NEXT:    v_and_or_b32 v1, v2, s4, v1
 ; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -1658,13 +1684,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, s2, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
 ; GFX11-NEXT:    v_and_or_b32 v1, v2, s4, v1
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
@@ -1690,108 +1721,129 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_bf16_system:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX900-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; GFX900-NEXT:    s_mov_b64 s[0:1], 0
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0x7fc0
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    s_and_b32 s2, s4, -4
-; GFX900-NEXT:    s_mov_b32 s3, s5
-; GFX900-NEXT:    s_load_dword s6, s[2:3], 0x0
-; GFX900-NEXT:    s_and_b32 s4, s4, 3
-; GFX900-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX900-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX900-NEXT:    s_not_b32 s5, s5
+; GFX900-NEXT:    s_and_b32 s2, s6, -4
+; GFX900-NEXT:    s_mov_b32 s3, s7
+; GFX900-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX900-NEXT:    s_and_b32 s5, s6, 3
+; GFX900-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX900-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX900-NEXT:    s_not_b32 s6, s6
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX900-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_mov_b32_e32 v2, v1
-; GFX900-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX900-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX900-NEXT:    v_and_or_b32 v1, v2, s5, v1
-; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX900-NEXT:    v_mov_b32_e32 v3, v2
+; GFX900-NEXT:    v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX900-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; GFX900-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX900-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, s5, v2
+; GFX900-NEXT:    v_and_or_b32 v2, v3, s6, v2
+; GFX900-NEXT:    global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX900-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX900-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX900-NEXT:    v_lshrrev_b32_e32 v0, s5, v2
 ; GFX900-NEXT:    global_store_short v[0:1], v0, off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_ret_bf16_system:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX908-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; GFX908-NEXT:    s_mov_b64 s[0:1], 0
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0x7fc0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_and_b32 s2, s4, -4
-; GFX908-NEXT:    s_mov_b32 s3, s5
-; GFX908-NEXT:    s_load_dword s6, s[2:3], 0x0
-; GFX908-NEXT:    s_and_b32 s4, s4, 3
-; GFX908-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX908-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX908-NEXT:    s_not_b32 s5, s5
+; GFX908-NEXT:    s_and_b32 s2, s6, -4
+; GFX908-NEXT:    s_mov_b32 s3, s7
+; GFX908-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX908-NEXT:    s_and_b32 s5, s6, 3
+; GFX908-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX908-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX908-NEXT:    s_not_b32 s6, s6
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v2, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v1, v2, s5, v1
-; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; GFX908-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX908-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX908-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX908-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, s5, v2
+; GFX908-NEXT:    v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT:    global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s5, v2
 ; GFX908-NEXT:    global_store_short v[0:1], v0, off
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_bf16_system:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x7fc0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s2, s4, -4
-; GFX90A-NEXT:    s_mov_b32 s3, s5
-; GFX90A-NEXT:    s_load_dword s6, s[2:3], 0x0
-; GFX90A-NEXT:    s_and_b32 s4, s4, 3
-; GFX90A-NEXT:    s_lshl_b32 s4, s4, 3
-; GFX90A-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX90A-NEXT:    s_not_b32 s5, s5
+; GFX90A-NEXT:    s_and_b32 s2, s6, -4
+; GFX90A-NEXT:    s_mov_b32 s3, s7
+; GFX90A-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX90A-NEXT:    s_and_b32 s5, s6, 3
+; GFX90A-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX90A-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX90A-NEXT:    s_not_b32 s6, s6
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v2, v3, s5, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; GFX90A-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX90A-NEXT:    v_add3_u32 v4, v4, v2, s4
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX90A-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s5, v2
+; GFX90A-NEXT:    v_and_or_b32 v2, v3, s6, v2
 ; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s5, v2
 ; GFX90A-NEXT:    global_store_short v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
@@ -1815,7 +1867,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
 ; GFX10-NEXT:    v_and_or_b32 v1, v2, s4, v1
 ; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -1846,13 +1903,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, s2, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
 ; GFX11-NEXT:    v_and_or_b32 v1, v2, s4, v1
 ; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 78db126fb2dc49..f494e5d7ab6bb9 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -3,10 +3,10 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
 
 ; We only care about which physical registers the parameters are copied from;
 ; the function bodies are just some arbitrary uses.
@@ -64,59 +64,113 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre
   ; GISEL-GFX10-NEXT:   FLAT_STORE_DWORDX4 [[COPY12]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into `ptr poison`)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
-  ; DAGISEL-GFX11-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
-  ; DAGISEL-GFX10-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %c = add <4 x i32> %a, %b
   store <4 x i32> %c, ptr poison
   ret void
@@ -183,81 +237,157 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_ptr(ptr inreg
   ; GISEL-GFX10-NEXT:   BUFFER_STORE_DWORD_OFFEN [[COPY15]], [[COPY11]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (p5) into %ir.b5, addrspace 5)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
-  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX11-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
-  ; DAGISEL-GFX11-NEXT:   [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
-  ; DAGISEL-GFX11-NEXT:   DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
-  ; DAGISEL-GFX11-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
-  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX10-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
-  ; DAGISEL-GFX10-NEXT:   [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
-  ; DAGISEL-GFX10-NEXT:   DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
-  ; DAGISEL-GFX10-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; DAGISEL-GFX10-NEXT:   BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-WF32-NEXT:   DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-WF32-NEXT:   SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-WF64-NEXT:   DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-WF64-NEXT:   SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-WF32-NEXT:   DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-WF32-NEXT:   BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-WF64-NEXT:   DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-WF64-NEXT:   BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   store ptr %a, ptr %b
   store ptr addrspace(1) %a1, ptr addrspace(1) %b1
   store ptr addrspace(3) %a3, ptr addrspace(3) %b3
@@ -346,119 +476,233 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr,
   ; GISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY22]], [[REG_SEQUENCE3]], 16, 0, implicit $exec :: (store (<4 x s32>) into `ptr addrspace(1) poison` + 16, addrspace 1)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
-  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
-  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
-  ; DAGISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; DAGISEL-GFX11-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX11-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX11-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX11-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
-  ; DAGISEL-GFX11-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
-  ; DAGISEL-GFX11-NEXT:   [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
-  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
-  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
-  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; DAGISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX10-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX10-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX10-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
-  ; DAGISEL-GFX10-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
-  ; DAGISEL-GFX10-NEXT:   [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %p = extractvalue {ptr, i32, <4 x i32>} %a, 0
   %i = extractvalue {ptr, i32, <4 x i32>} %a, 1
   %v = extractvalue {ptr, i32, <4 x i32>} %a, 2
@@ -497,29 +741,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_float(float in
   ; GISEL-GFX10-NEXT:   FLAT_STORE_DWORD [[COPY3]], [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_float
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_float
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %c = fadd float %a, %b
   store float %c, ptr poison
   ret void
@@ -552,29 +820,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
   ; GISEL-GFX10-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_half
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_half
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %c = fadd half %a, %b
   store half %c, ptr poison
   ret void
@@ -607,33 +899,93 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
   ; GISEL-GFX10-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %c = fadd bfloat %a, %b
   store bfloat %c, ptr poison
   ret void
@@ -666,29 +1018,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; GISEL-GFX10-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %c = add i16 %a, %b
   store i16 %c, ptr poison
   ret void
@@ -787,101 +1163,197 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i
   ; GISEL-GFX10-NEXT:   FLAT_STORE_DWORDX4 [[COPY27]], [[COPY25]], 16, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into `ptr poison` + 16, basealign 32)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
-  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
-  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
-  ; DAGISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
-  ; DAGISEL-GFX11-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
-  ; DAGISEL-GFX11-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; DAGISEL-GFX11-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX11-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX11-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX11-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
-  ; DAGISEL-GFX11-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
-  ; DAGISEL-GFX11-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
-  ; DAGISEL-GFX11-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
-  ; DAGISEL-GFX11-NEXT:   FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
-  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
-  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
-  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
-  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
-  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
-  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
-  ; DAGISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
-  ; DAGISEL-GFX10-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; DAGISEL-GFX10-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; DAGISEL-GFX10-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; DAGISEL-GFX10-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; DAGISEL-GFX10-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
-  ; DAGISEL-GFX10-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
-  ; DAGISEL-GFX10-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
-  ; DAGISEL-GFX10-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
-  ; DAGISEL-GFX10-NEXT:   FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX10-WF32-NEXT:   FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+  ; DAGISEL-GFX10-WF64-NEXT:   FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %c = add <16 x i16> %a, %b
   store <16 x i16> %c, ptr poison
   ret void
@@ -922,45 +1394,85 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i
   ; GISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY5]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (<2 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_many_regs
-  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr35, $vgpr8, $vgpr135
-  ; DAGISEL-GFX11-NEXT: {{  $}}
-  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
-  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
-  ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
-  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX11-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_many_regs
-  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr35, $vgpr8, $vgpr135
-  ; DAGISEL-GFX10-NEXT: {{  $}}
-  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
-  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
-  ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
-  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX10-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
-  ; DAGISEL-GFX10-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr35, $vgpr8, $vgpr135
+  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF32-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr35, $vgpr8, $vgpr135
+  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-WF64-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+  ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF32-NEXT:   liveins: $sgpr35, $vgpr8, $vgpr135
+  ; DAGISEL-GFX10-WF32-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; DAGISEL-GFX10-WF32-NEXT:   [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+  ; DAGISEL-GFX10-WF32-NEXT:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF32-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF32-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+  ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-WF64-NEXT:   liveins: $sgpr35, $vgpr8, $vgpr135
+  ; DAGISEL-GFX10-WF64-NEXT: {{  $}}
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; DAGISEL-GFX10-WF64-NEXT:   [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+  ; DAGISEL-GFX10-WF64-NEXT:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-WF64-NEXT:   GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+  ; DAGISEL-GFX10-WF64-NEXT:   S_ENDPGM 0
   %c = extractelement <36 x i32> %a, i32 35
   store i32 %c, ptr addrspace(1) poison
 
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 3be4665cf3a005..27398ecb77bd56 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1398,61 +1398,75 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, -4, v0
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    ds_read_b32 v3, v1
+; VI-NEXT:    ds_read_b32 v4, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_mov_b32 s4, 0xffff
 ; VI-NEXT:    v_and_b32_e32 v0, 24, v2
 ; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
 ; VI-NEXT:    v_not_b32_e32 v2, v2
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0x7fc0
 ; VI-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v4, v3
-; VI-NEXT:    v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_f32_e32 v3, 4.0, v3
-; VI-NEXT:    v_and_b32_e32 v5, v4, v2
-; VI-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v3, v5, v3
-; VI-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_lshrrev_b32_sdwa v4, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; VI-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v4
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; VI-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc
+; VI-NEXT:    v_and_b32_e32 v6, v5, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; VI-NEXT:    v_or_b32_e32 v4, v6, v4
+; VI-NEXT:    ds_cmpst_rtn_b32 v4, v1, v5, v4
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB10_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v0, v4
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fadd_ret_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, -4, v0
-; GFX9-NEXT:    ds_read_b32 v2, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX9-NEXT:    ds_read_b32 v4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, 24, v3
-; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s4
-; GFX9-NEXT:    v_not_b32_e32 v3, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc0
 ; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_add_f32_e32 v2, 4.0, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_and_or_b32 v2, v4, v3, v2
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX9-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v6, v6, v4, s6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX9-NEXT:    v_and_or_b32 v4, v5, v2, v4
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v4, v1, v5, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_bf16:
@@ -1539,19 +1553,26 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
 ; VI-NEXT:    v_not_b32_e32 v2, v2
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:    v_mov_b32_e32 v4, 0x7fc0
 ; VI-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; VI-NEXT:    v_and_b32_e32 v5, v3, v2
-; VI-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; VI-NEXT:    v_lshrrev_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f32_e32 v5, 4.0, v5
+; VI-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; VI-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
+; VI-NEXT:    v_and_b32_e32 v6, v3, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v5, v0, v5
+; VI-NEXT:    v_or_b32_e32 v5, v6, v5
+; VI-NEXT:    ds_cmpst_rtn_b32 v5, v1, v3, v5
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v3, v4
+; VI-NEXT:    v_mov_b32_e32 v3, v5
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB11_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1569,18 +1590,25 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc0
 ; GFX9-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_f32_e32 v5, 4.0, v5
+; GFX9-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX9-NEXT:    v_add3_u32 v6, v6, v5, s6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v6, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, v0, v5
+; GFX9-NEXT:    v_and_or_b32 v5, v3, v2, v5
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v5, v1, v3, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v3, v4
+; GFX9-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index d76bb48b4a82a3..377773be7d412e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -4237,57 +4237,99 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
 define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C)  {
 ; GFX9-LABEL: fma_shuffle_v2bf16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc0
+; GFX9-NEXT:    s_mov_b32 s3, 0x5040100
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[6:7]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v0, s[0:1]
+; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_fma_f32 v7, v8, v9, v7
-; GFX9-NEXT:    v_fma_f32 v0, v8, v4, v0
-; GFX9-NEXT:    v_fma_f32 v8, v12, v9, v11
-; GFX9-NEXT:    v_fma_f32 v1, v12, v4, v1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_fma_f32 v8, v9, v10, v8
+; GFX9-NEXT:    v_fma_f32 v1, v9, v5, v1
+; GFX9-NEXT:    v_fma_f32 v2, v13, v5, v2
+; GFX9-NEXT:    v_bfe_u32 v5, v8, 16, 1
+; GFX9-NEXT:    v_fma_f32 v9, v13, v10, v12
+; GFX9-NEXT:    v_bfe_u32 v10, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v5, v5, v8, s2
+; GFX9-NEXT:    v_bfe_u32 v12, v9, 16, 1
+; GFX9-NEXT:    v_add3_u32 v10, v10, v1, s2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v12, v12, v9, s2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_add3_u32 v13, v13, v2, s2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v9, v9
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v8
-; GFX9-NEXT:    v_fma_f32 v0, v2, v10, v0
-; GFX9-NEXT:    v_fma_f32 v2, v2, v5, v4
-; GFX9-NEXT:    v_fma_f32 v1, v3, v10, v1
-; GFX9-NEXT:    v_fma_f32 v3, v3, v5, v7
-; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s0
-; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s0
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v12, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v13, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_fma_f32 v1, v3, v11, v1
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_fma_f32 v3, v3, v6, v5
+; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT:    v_fma_f32 v2, v4, v11, v2
+; GFX9-NEXT:    v_fma_f32 v4, v4, v6, v8
+; GFX9-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s2
+; GFX9-NEXT:    v_bfe_u32 v8, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v6, v6, v3, s2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_bfe_u32 v9, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v8, v8, v2, s2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_add3_u32 v9, v9, v4, s2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v4, v2, s3
+; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fma_shuffle_v2bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[6:7]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[10:11]
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -4296,73 +4338,164 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v9
 ; GFX10-NEXT:    v_fmac_f32_e32 v0, v8, v4
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_fmac_f32_e32 v11, v10, v4
-; GFX10-NEXT:    v_fmac_f32_e32 v1, v10, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v11, v4
+; GFX10-NEXT:    v_bfe_u32 v4, v7, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_bfe_u32 v9, v10, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v11, v11, v1, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v2, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
 ; GFX10-NEXT:    v_fmac_f32_e32 v0, v2, v12
-; GFX10-NEXT:    v_fmac_f32_e32 v4, v3, v12
-; GFX10-NEXT:    v_fmac_f32_e32 v1, v3, v5
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v2, v5
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
-; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v3, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v3, v5
+; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v3, v4, 16, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_bfe_u32 v8, v7, 16, 1
+; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_add3_u32 v8, v8, v7, 0x7fff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
+; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fma_shuffle_v2bf16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x10
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x10
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[4:5]
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[4:5]
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v1, v10, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v1, v3, v5 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-NEXT:    v_dual_fmac_f32 v11, v10, v4 :: v_dual_lshlrev_b32 v8, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_fmac_f32 v10, v11, v9 :: v_dual_and_b32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_fmac_f32_e32 v1, v11, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v9
+; GFX11-NEXT:    v_bfe_u32 v9, v10, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v11, v11, v1, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_fmac_f32 v7, v8, v9 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_fmac_f32_e32 v0, v8, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v11
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_fmac_f32 v4, v3, v12 :: v_dual_fmac_f32 v7, v2, v5
+; GFX11-NEXT:    v_bfe_u32 v4, v7, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, 0x7fc0, v9 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fmac_f32_e32 v1, v3, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v7, v3, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_fmac_f32_e32 v0, v2, v12
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v2, v5
+; GFX11-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v8, v7, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v3, v4, 16, 1
+; GFX11-NEXT:    v_add3_u32 v8, v8, v7, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v4, 0x7fff
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[2:3]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index a9faa130d6379f..884860712632d7 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK,SM80 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
 ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}



More information about the llvm-commits mailing list