[llvm] Correctly round FP -> BF16 when SDAG expands such nodes (PR #82399)
David Majnemer via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 20 10:34:30 PST 2024
https://github.com/majnemer created https://github.com/llvm/llvm-project/pull/82399
We did something pretty naive:
- round FP64 -> BF16 by first rounding to FP32
- skip FP32 -> BF16 rounding entirely
- take the top 16 bits of an FP32, which will turn some NaNs into infinities
Let's do this in a more principled way by rounding types with more precision than FP32 to FP32 using round-inexact-to-odd, which avoids double-rounding issues.
>From 4a278ac29d39c44cbe291113f10da2b2a6976d66 Mon Sep 17 00:00:00 2001
From: David Majnemer <david.majnemer at gmail.com>
Date: Thu, 15 Feb 2024 18:15:45 +0000
Subject: [PATCH] Correctly round FP -> BF16 when SDAG expands such nodes
We did something pretty naive:
- round FP64 -> BF16 by first rounding to FP32
- skip FP32 -> BF16 rounding entirely
- take the top 16 bits of an FP32, which will turn some NaNs into infinities
Let's do this in a more principled way by rounding types with more
precision than FP32 to FP32 using round-inexact-to-odd, which
avoids double-rounding issues.
---
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 94 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 53 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 3 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 10 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 15370 +++++++++++++---
.../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 16 +-
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 286 +-
.../isel-amdgpu-cs-chain-preserve-cc.ll | 1462 +-
llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 104 +-
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 293 +-
llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 2 +-
11 files changed, 14119 insertions(+), 3574 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 252b6e9997a710..3426956a41b3d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3219,8 +3219,98 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::FP_ROUND: {
EVT VT = Node->getValueType(0);
if (VT.getScalarType() == MVT::bf16) {
- Results.push_back(
- DAG.getNode(ISD::FP_TO_BF16, SDLoc(Node), VT, Node->getOperand(0)));
+ if (Node->getConstantOperandVal(1) == 1) {
+ Results.push_back(
+ DAG.getNode(ISD::FP_TO_BF16, SDLoc(Node), VT, Node->getOperand(0)));
+ break;
+ }
+ SDValue Op = Node->getOperand(0);
+ SDValue IsNaN = DAG.getSetCC(dl, getSetCCResultType(Op.getValueType()),
+ Op, Op, ISD::SETUO);
+ if (Op.getValueType() != MVT::f32) {
+ // We are rounding binary64/binary128 -> binary32 -> bfloat16. This
+ // can induce double-rounding which may alter the results. We can
+ // correct for this using a trick explained in: Boldo, Sylvie, and
+ // Guillaume Melquiond. "When double rounding is odd." 17th IMACS
+ // World Congress. 2005.
+ FloatSignAsInt ValueAsInt;
+ getSignAsIntValue(ValueAsInt, dl, Op);
+ EVT WideIntVT = ValueAsInt.IntValue.getValueType();
+ SDValue SignMask = DAG.getConstant(ValueAsInt.SignMask, dl, WideIntVT);
+ SDValue SignBit =
+ DAG.getNode(ISD::AND, dl, WideIntVT, ValueAsInt.IntValue, SignMask);
+ SDValue AbsWide;
+ if (TLI.isOperationLegalOrCustom(ISD::FABS, ValueAsInt.FloatVT)) {
+ AbsWide = DAG.getNode(ISD::FABS, dl, ValueAsInt.FloatVT, Op);
+ } else {
+ SDValue ClearSignMask =
+ DAG.getConstant(~ValueAsInt.SignMask, dl, WideIntVT);
+ SDValue ClearedSign = DAG.getNode(ISD::AND, dl, WideIntVT,
+ ValueAsInt.IntValue, ClearSignMask);
+ AbsWide = modifySignAsInt(ValueAsInt, dl, ClearedSign);
+ }
+ SDValue AbsNarrow =
+ DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, AbsWide,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+ SDValue AbsNarrowAsWide =
+ DAG.getNode(ISD::FP_EXTEND, dl, ValueAsInt.FloatVT, AbsNarrow);
+
+ // We can keep the narrow value as-is if narrowing was exact (no
+ // rounding error), the wide value was NaN (the narrow value is also
+ // NaN and should be preserved) or if we rounded to the odd value.
+ SDValue NarrowBits = DAG.getNode(ISD::BITCAST, dl, MVT::i32, AbsNarrow);
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue NegativeOne = DAG.getConstant(-1, dl, MVT::i32);
+ SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, NarrowBits, One);
+ EVT I32CCVT = getSetCCResultType(And.getValueType());
+ SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+ SDValue AlreadyOdd = DAG.getSetCC(dl, I32CCVT, And, Zero, ISD::SETNE);
+
+ EVT WideSetCCVT = getSetCCResultType(AbsWide.getValueType());
+ SDValue KeepNarrow = DAG.getSetCC(dl, WideSetCCVT, AbsWide,
+ AbsNarrowAsWide, ISD::SETUEQ);
+ KeepNarrow =
+ DAG.getNode(ISD::OR, dl, WideSetCCVT, KeepNarrow, AlreadyOdd);
+ // We morally performed a round-down if `abs_narrow` is smaller than
+ // `abs_wide`.
+ SDValue NarrowIsRd = DAG.getSetCC(dl, WideSetCCVT, AbsWide,
+ AbsNarrowAsWide, ISD::SETOGT);
+ // If the narrow value is odd or exact, pick it.
+ // Otherwise, narrow is even and corresponds to either the rounded-up
+ // or rounded-down value. If narrow is the rounded-down value, we want
+ // the rounded-up value as it will be odd.
+ SDValue Adjust =
+ DAG.getSelect(dl, MVT::i32, NarrowIsRd, One, NegativeOne);
+ Adjust = DAG.getSelect(dl, MVT::i32, KeepNarrow, Zero, Adjust);
+ int ShiftAmount = ValueAsInt.SignBit - 31;
+ SDValue ShiftCnst = DAG.getConstant(
+ ShiftAmount, dl,
+ TLI.getShiftAmountTy(WideIntVT, DAG.getDataLayout()));
+ SignBit = DAG.getNode(ISD::SRL, dl, WideIntVT, SignBit, ShiftCnst);
+ SignBit = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, SignBit);
+ Op = DAG.getNode(ISD::OR, dl, MVT::i32, Adjust, SignBit);
+ } else {
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+ }
+
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue Lsb = DAG.getNode(
+ ISD::SRL, dl, MVT::i32, Op,
+ DAG.getConstant(16, dl,
+ TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+ Lsb = DAG.getNode(ISD::AND, dl, MVT::i32, Lsb, One);
+ SDValue RoundingBias = DAG.getNode(
+ ISD::ADD, dl, MVT::i32, DAG.getConstant(0x7fff, dl, MVT::i32), Lsb);
+ SDValue Add = DAG.getNode(ISD::ADD, dl, MVT::i32, Op, RoundingBias);
+ Op = DAG.getNode(
+ ISD::SRL, dl, MVT::i32, Add,
+ DAG.getConstant(16, dl,
+ TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+ Op = DAG.getSelect(dl, MVT::i32, IsNaN,
+ DAG.getConstant(0x00007fc0, dl, MVT::i32), Op);
+
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Op);
+ Results.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::bf16, Op));
break;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7f58b312e7a201..e75799ca13b0bb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -776,6 +776,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
AddPromotedToType(Op, MVT::bf16, MVT::f32);
}
+ if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
+ setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
+ }
+
// sm_80 only has conversions between f32 and bf16. Custom lower all other
// bf16 conversions.
if (STI.hasBF16Math() &&
@@ -2465,6 +2470,50 @@ SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
return Op;
}
+SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getValueType() == MVT::bf16) {
+ if (Op.getOperand(0).getValueType() == MVT::f32 &&
+ (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70)) {
+ SDLoc Loc(Op);
+ return DAG.getNode(ISD::FP_TO_BF16, Loc, MVT::bf16, Op.getOperand(0));
+ }
+ if (Op.getOperand(0).getValueType() == MVT::f64 &&
+ (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
+ SDLoc Loc(Op);
+ return DAG.getNode(ISD::FP_TO_BF16, Loc, MVT::bf16, Op.getOperand(0));
+ }
+ }
+
+ // Everything else is considered legal.
+ return Op;
+}
+
+SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getOperand(0).getValueType() == MVT::bf16) {
+ if (Op.getValueType() == MVT::f32 &&
+ (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
+ SDLoc Loc(Op);
+ return DAG.getNode(ISD::BF16_TO_FP, Loc, Op.getValueType(),
+ Op.getOperand(0));
+ }
+ if (Op.getValueType() == MVT::f64 &&
+ (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
+ SDLoc Loc(Op);
+ if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
+ Op = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0));
+ return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, Op);
+ }
+ return DAG.getNode(ISD::BF16_TO_FP, Loc, Op.getValueType(),
+ Op.getOperand(0));
+ }
+ }
+
+ // Everything else is considered legal.
+ return Op;
+}
+
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
if (Op.getValueType() != MVT::v2i16)
@@ -2527,6 +2576,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_ROUND:
+ return LowerFP_ROUND(Op, DAG);
+ case ISD::FP_EXTEND:
+ return LowerFP_EXTEND(Op, DAG);
case ISD::VAARG:
return LowerVAARG(Op, DAG);
case ISD::VASTART:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5d3fd992812ef9..cf1d4580766918 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -618,6 +618,9 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 631136ad621464..40d82ebecbed35 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -662,7 +662,7 @@ let hasSideEffects = false in {
// bf16->f32 was introduced early.
[hasPTX<71>, hasSM<80>],
// bf16->everything else needs sm90/ptx78
- [hasPTX<78>, hasSM<90>])>;
+ [hasPTX<78>, hasSM<90>])>;
def _f32 :
NVPTXInst<(outs RC:$dst),
(ins Float32Regs:$src, CvtMode:$mode),
@@ -3647,7 +3647,7 @@ def : Pat<(f16 (fpround Float32Regs:$a)),
// fpround f32 -> bf16
def : Pat<(bf16 (fpround Float32Regs:$a)),
- (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
+ (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)),
@@ -3655,7 +3655,7 @@ def : Pat<(f16 (fpround Float64Regs:$a)),
// fpround f64 -> bf16
def : Pat<(bf16 (fpround Float64Regs:$a)),
- (CVT_bf16_f64 Float64Regs:$a, CvtRN)>;
+ (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),
(CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
@@ -3671,7 +3671,7 @@ def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
(CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
- (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>;
+ (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
// fpextend f16 -> f64
def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
@@ -3679,7 +3679,7 @@ def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
// fpextend bf16 -> f64
def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
- (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>;
+ (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 387c4a16a008ae..39cb0a768701c0 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1918,8 +1918,14 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fc0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -1928,8 +1934,15 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1938,7 +1951,12 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_f32_to_bf16:
@@ -1946,7 +1964,14 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load float, ptr addrspace(1) %in
%val.bf16 = fptrunc float %val to bfloat
@@ -1989,9 +2014,25 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x7fc0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v1
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GFX8-NEXT: v_or_b32_e32 v5, v4, v8
+; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc
; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -2000,9 +2041,26 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_brev_b32 s8, 1
+; GFX9-NEXT: s_movk_i32 s9, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
+; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -2011,8 +2069,22 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
+; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
+; GFX10-NEXT: s_or_b32 s4, s4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4
+; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
+; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_f64_to_bf16:
@@ -2020,8 +2092,27 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
+; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX11-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
+; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
+; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
+; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load double, ptr addrspace(1) %in
%val.bf16 = fptrunc double %val to bfloat
@@ -8487,7 +8578,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_bf16:
@@ -8496,7 +8593,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16:
@@ -8505,7 +8608,11 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_bf16:
@@ -8515,7 +8622,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd bfloat %a, %b
ret bfloat %op
@@ -8553,12 +8666,25 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v2bf16:
@@ -8566,11 +8692,23 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -8583,7 +8721,17 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add_f32_e32 v2, v3, v2
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v2bf16:
@@ -8596,8 +8744,22 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_add_f32_e32 v2, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <2 x bfloat> %a, %b
ret <2 x bfloat> %op
@@ -8644,15 +8806,34 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v3bf16:
@@ -8661,32 +8842,98 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
@@ -8739,20 +8986,46 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v4bf16:
@@ -8760,17 +9033,39 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8781,17 +9076,37 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_add_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v4bf16:
@@ -8803,15 +9118,40 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -8897,36 +9237,86 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_add_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v9, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v8bf16:
@@ -8934,29 +9324,71 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
@@ -8975,58 +9407,151 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_add_f32_e32 v8, v9, v8
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_add_f32_e32 v9, v11, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_add_f32_e32 v10, v11, v10
-; GFX10-NEXT: v_add_f32_e32 v11, v13, v12
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_add_f32_e32 v10, v10, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_add_f32_e32 v11, v12, v11
+; GFX10-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v3, v3, v7
+; GFX11-NEXT: v_add_f32_e32 v9, v11, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_dual_add_f32 v10, v10, v12 :: v_dual_add_f32 v1, v1, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v11, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v11, v12, v11
+; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_dual_add_f32 v10, v11, v10 :: v_dual_add_f32 v11, v13, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %op
@@ -9180,122 +9705,302 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v15, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT: v_add_f32_e32 v15, v18, v15
+; GFX8-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v14, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT: v_add_f32_e32 v14, v18, v14
+; GFX8-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_add_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_add_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v13, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT: v_add_f32_e32 v13, v18, v13
+; GFX8-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_add_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v12, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT: v_add_f32_e32 v12, v18, v12
+; GFX8-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_add_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_add_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT: v_add_f32_e32 v11, v18, v11
+; GFX8-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_add_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v10, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_add_f32_e32 v10, v18, v10
+; GFX8-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_add_f32_e32 v9, v18, v9
+; GFX8-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_add_f32_e32 v9, v17, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v16, v7
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_fadd_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v15, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v14, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v13, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v12, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v11, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
@@ -9313,119 +10018,294 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_add_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_add_f32_e32 v7, v7, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT: v_add_f32_e32 v15, v17, v15
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
-; GFX10-NEXT: v_add_f32_e32 v14, v19, v18
+; GFX10-NEXT: v_add_f32_e32 v14, v20, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX10-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_add_f32_e32 v13, v21, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v12
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_add_f32_e32 v17, v20, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v12
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v18, v20, v19
+; GFX10-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT: v_add_f32_e32 v12, v19, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_add_f32_e32 v11, v19, v18
+; GFX10-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT: v_add_f32_e32 v10, v21, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_add_f32_e32 v19, v20, v19
-; GFX10-NEXT: v_add_f32_e32 v20, v22, v21
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT: v_add_f32_e32 v17, v19, v17
; GFX10-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_add_f32_e32 v13, v21, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v7, v7, v15
; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT: v_add_f32_e32 v15, v17, v15
+; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_add_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v14, v20, v18 :: v_dual_add_f32 v5, v5, v13
+; GFX11-NEXT: v_add_f32_e32 v16, v17, v16
+; GFX11-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT: v_add_f32_e32 v14, v19, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT: v_add_f32_e32 v17, v20, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_add_f32 v4, v4, v12
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v12, v19, v18
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_dual_add_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v11, v19, v18
+; GFX11-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v19, v20, v19 :: v_dual_add_f32 v20, v22, v21
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v17, v19, v17 :: v_dual_add_f32 v2, v2, v10
+; GFX11-NEXT: v_add_f32_e32 v10, v21, v20
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v8
+; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <16 x bfloat> %a, %b
ret <16 x bfloat> %op
@@ -9833,247 +10713,716 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-LABEL: v_fadd_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v35, s30, 0
+; GFX8-NEXT: v_writelane_b32 v35, s31, 1
+; GFX8-NEXT: v_writelane_b32 v35, s34, 2
+; GFX8-NEXT: v_writelane_b32 v35, s35, 3
+; GFX8-NEXT: v_writelane_b32 v35, s36, 4
+; GFX8-NEXT: v_writelane_b32 v35, s37, 5
+; GFX8-NEXT: v_writelane_b32 v35, s38, 6
+; GFX8-NEXT: v_writelane_b32 v35, s39, 7
+; GFX8-NEXT: v_writelane_b32 v35, s40, 8
+; GFX8-NEXT: v_writelane_b32 v35, s41, 9
+; GFX8-NEXT: v_writelane_b32 v35, s42, 10
+; GFX8-NEXT: v_writelane_b32 v35, s43, 11
+; GFX8-NEXT: v_writelane_b32 v35, s44, 12
+; GFX8-NEXT: v_writelane_b32 v35, s45, 13
+; GFX8-NEXT: v_writelane_b32 v35, s46, 14
+; GFX8-NEXT: v_writelane_b32 v35, s47, 15
+; GFX8-NEXT: v_writelane_b32 v35, s48, 16
+; GFX8-NEXT: v_writelane_b32 v35, s49, 17
+; GFX8-NEXT: v_writelane_b32 v35, s50, 18
+; GFX8-NEXT: v_writelane_b32 v35, s51, 19
+; GFX8-NEXT: v_writelane_b32 v35, s52, 20
+; GFX8-NEXT: v_writelane_b32 v35, s53, 21
+; GFX8-NEXT: v_writelane_b32 v35, s54, 22
+; GFX8-NEXT: v_writelane_b32 v35, s55, 23
+; GFX8-NEXT: v_writelane_b32 v35, s56, 24
+; GFX8-NEXT: v_writelane_b32 v35, s57, 25
+; GFX8-NEXT: v_writelane_b32 v35, s58, 26
+; GFX8-NEXT: v_writelane_b32 v35, s59, 27
+; GFX8-NEXT: v_writelane_b32 v35, s60, 28
+; GFX8-NEXT: v_writelane_b32 v35, s61, 29
+; GFX8-NEXT: v_writelane_b32 v35, s62, 30
+; GFX8-NEXT: v_writelane_b32 v35, s63, 31
+; GFX8-NEXT: v_writelane_b32 v35, s64, 32
+; GFX8-NEXT: v_writelane_b32 v35, s65, 33
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_writelane_b32 v35, s66, 34
; GFX8-NEXT: v_add_f32_e32 v31, v32, v31
; GFX8-NEXT: v_add_f32_e32 v30, v14, v30
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT: v_writelane_b32 v35, s67, 35
+; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT: v_writelane_b32 v35, s68, 36
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT: s_movk_i32 s68, 0x7fff
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v13
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_add_f32_e32 v14, v32, v14
-; GFX8-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_add_f32_e32 v31, v31, v30
+; GFX8-NEXT: v_add_f32_e32 v29, v13, v29
+; GFX8-NEXT: v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT: v_add_f32_e32 v31, v31, v29
+; GFX8-NEXT: v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_add_f32_e32 v29, v32, v29
-; GFX8-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT: v_add_f32_e32 v28, v12, v28
+; GFX8-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT: v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT: v_add_f32_e32 v31, v15, v31
+; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT: v_add_f32_e32 v28, v28, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v11
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_add_f32_e32 v28, v32, v28
-; GFX8-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_add_f32_e32 v34, v34, v31
+; GFX8-NEXT: v_add_f32_e32 v27, v11, v27
+; GFX8-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_add_f32_e32 v27, v32, v27
-; GFX8-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_add_f32_e32 v34, v34, v27
+; GFX8-NEXT: v_add_f32_e32 v26, v10, v26
+; GFX8-NEXT: v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v9
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_add_f32_e32 v26, v32, v26
-; GFX8-NEXT: v_add_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_add_f32_e32 v34, v34, v26
+; GFX8-NEXT: v_add_f32_e32 v25, v9, v25
+; GFX8-NEXT: v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v8
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT: v_add_f32_e32 v25, v32, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_add_f32_e32 v32, v32, v33
-; GFX8-NEXT: v_add_f32_e32 v15, v15, v24
+; GFX8-NEXT: v_add_f32_e32 v34, v34, v25
+; GFX8-NEXT: v_add_f32_e32 v24, v8, v24
+; GFX8-NEXT: v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v7
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_add_f32_e32 v24, v33, v24
-; GFX8-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_add_f32_e32 v34, v34, v24
+; GFX8-NEXT: v_add_f32_e32 v23, v7, v23
+; GFX8-NEXT: v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT: v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_add_f32_e32 v34, v34, v23
; GFX8-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT: v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT: v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT: v_add_f32_e32 v6, v34, v6
+; GFX8-NEXT: v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_add_f32_e32 v22, v33, v22
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT: v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v21, v5
+; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT: v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v21, v33, v21
-; GFX8-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT: v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT: v_add_f32_e32 v4, v20, v4
+; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT: v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_add_f32_e32 v20, v33, v20
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT: v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT: v_add_f32_e32 v3, v19, v3
+; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT: v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v19, v33, v19
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT: v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_add_f32_e32 v2, v18, v2
+; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT: v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v18, v33, v18
-; GFX8-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT: v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_add_f32_e32 v1, v17, v1
+; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT: v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_add_f32_e32 v17, v33, v17
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT: v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT: v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], v0, v28
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], s68, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT: v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT: v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT: v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT: v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT: v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT: v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT: v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT: v_readlane_b32 s68, v35, 36
+; GFX8-NEXT: v_readlane_b32 s67, v35, 35
+; GFX8-NEXT: v_readlane_b32 s66, v35, 34
+; GFX8-NEXT: v_readlane_b32 s65, v35, 33
+; GFX8-NEXT: v_readlane_b32 s64, v35, 32
+; GFX8-NEXT: v_readlane_b32 s63, v35, 31
+; GFX8-NEXT: v_readlane_b32 s62, v35, 30
+; GFX8-NEXT: v_readlane_b32 s61, v35, 29
+; GFX8-NEXT: v_readlane_b32 s60, v35, 28
+; GFX8-NEXT: v_readlane_b32 s59, v35, 27
+; GFX8-NEXT: v_readlane_b32 s58, v35, 26
+; GFX8-NEXT: v_readlane_b32 s57, v35, 25
+; GFX8-NEXT: v_readlane_b32 s56, v35, 24
+; GFX8-NEXT: v_readlane_b32 s55, v35, 23
+; GFX8-NEXT: v_readlane_b32 s54, v35, 22
+; GFX8-NEXT: v_readlane_b32 s53, v35, 21
+; GFX8-NEXT: v_readlane_b32 s52, v35, 20
+; GFX8-NEXT: v_readlane_b32 s51, v35, 19
+; GFX8-NEXT: v_readlane_b32 s50, v35, 18
+; GFX8-NEXT: v_readlane_b32 s49, v35, 17
+; GFX8-NEXT: v_readlane_b32 s48, v35, 16
+; GFX8-NEXT: v_readlane_b32 s47, v35, 15
+; GFX8-NEXT: v_readlane_b32 s46, v35, 14
+; GFX8-NEXT: v_readlane_b32 s45, v35, 13
+; GFX8-NEXT: v_readlane_b32 s44, v35, 12
+; GFX8-NEXT: v_readlane_b32 s43, v35, 11
+; GFX8-NEXT: v_readlane_b32 s42, v35, 10
+; GFX8-NEXT: v_readlane_b32 s41, v35, 9
+; GFX8-NEXT: v_readlane_b32 s40, v35, 8
+; GFX8-NEXT: v_readlane_b32 s39, v35, 7
+; GFX8-NEXT: v_readlane_b32 s38, v35, 6
+; GFX8-NEXT: v_readlane_b32 s37, v35, 5
+; GFX8-NEXT: v_readlane_b32 s36, v35, 4
+; GFX8-NEXT: v_readlane_b32 s35, v35, 3
+; GFX8-NEXT: v_readlane_b32 s34, v35, 2
+; GFX8-NEXT: v_readlane_b32 s31, v35, 1
+; GFX8-NEXT: v_readlane_b32 s30, v35, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT: v_writelane_b32 v35, s30, 0
+; GFX9-NEXT: v_writelane_b32 v35, s31, 1
+; GFX9-NEXT: v_writelane_b32 v35, s34, 2
+; GFX9-NEXT: v_writelane_b32 v35, s35, 3
+; GFX9-NEXT: v_writelane_b32 v35, s36, 4
+; GFX9-NEXT: v_writelane_b32 v35, s37, 5
+; GFX9-NEXT: v_writelane_b32 v35, s38, 6
+; GFX9-NEXT: v_writelane_b32 v35, s39, 7
+; GFX9-NEXT: v_writelane_b32 v35, s40, 8
+; GFX9-NEXT: v_writelane_b32 v35, s41, 9
+; GFX9-NEXT: v_writelane_b32 v35, s42, 10
+; GFX9-NEXT: v_writelane_b32 v35, s43, 11
+; GFX9-NEXT: v_writelane_b32 v35, s44, 12
+; GFX9-NEXT: v_writelane_b32 v35, s45, 13
+; GFX9-NEXT: v_writelane_b32 v35, s46, 14
+; GFX9-NEXT: v_writelane_b32 v35, s47, 15
+; GFX9-NEXT: v_writelane_b32 v35, s48, 16
+; GFX9-NEXT: v_writelane_b32 v35, s49, 17
+; GFX9-NEXT: v_writelane_b32 v35, s50, 18
+; GFX9-NEXT: v_writelane_b32 v35, s51, 19
+; GFX9-NEXT: v_writelane_b32 v35, s52, 20
+; GFX9-NEXT: v_writelane_b32 v35, s53, 21
+; GFX9-NEXT: v_writelane_b32 v35, s54, 22
+; GFX9-NEXT: v_writelane_b32 v35, s55, 23
+; GFX9-NEXT: v_writelane_b32 v35, s56, 24
+; GFX9-NEXT: v_writelane_b32 v35, s57, 25
+; GFX9-NEXT: v_writelane_b32 v35, s58, 26
+; GFX9-NEXT: v_writelane_b32 v35, s59, 27
+; GFX9-NEXT: v_writelane_b32 v35, s60, 28
+; GFX9-NEXT: v_writelane_b32 v35, s61, 29
+; GFX9-NEXT: v_writelane_b32 v35, s62, 30
+; GFX9-NEXT: v_writelane_b32 v35, s63, 31
+; GFX9-NEXT: v_writelane_b32 v35, s64, 32
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_add_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s65, 33
+; GFX9-NEXT: v_add_f32_e32 v32, v32, v31
+; GFX9-NEXT: v_add_f32_e32 v30, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s66, 34
+; GFX9-NEXT: s_movk_i32 s66, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT: v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_add_f32_e32 v32, v32, v30
+; GFX9-NEXT: v_add_f32_e32 v29, v13, v29
+; GFX9-NEXT: v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT: v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT: v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_add_f32_e32 v29, v32, v29
-; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_add_f32_e32 v32, v32, v29
+; GFX9-NEXT: v_add_f32_e32 v28, v12, v28
+; GFX9-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT: v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT: v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT: v_add_f32_e32 v32, v32, v28
+; GFX9-NEXT: v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_add_f32_e32 v28, v32, v28
-; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_add_f32_e32 v15, v15, v33
+; GFX9-NEXT: v_add_f32_e32 v27, v11, v27
+; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT: v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT: v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_add_f32_e32 v27, v32, v27
-; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v27
+; GFX9-NEXT: v_add_f32_e32 v26, v10, v26
+; GFX9-NEXT: v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT: v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT: v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_add_f32_e32 v26, v32, v26
-; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v26
+; GFX9-NEXT: v_add_f32_e32 v25, v9, v25
+; GFX9-NEXT: v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT: v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT: v_add_f32_e32 v25, v32, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_add_f32_e32 v32, v32, v33
-; GFX9-NEXT: v_add_f32_e32 v15, v15, v24
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v25
+; GFX9-NEXT: v_add_f32_e32 v24, v8, v24
+; GFX9-NEXT: v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT: v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_add_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v24
+; GFX9-NEXT: v_add_f32_e32 v23, v7, v23
+; GFX9-NEXT: v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT: v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT: v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_add_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v23
+; GFX9-NEXT: v_add_f32_e32 v22, v6, v22
+; GFX9-NEXT: v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT: v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT: v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[40:41], v22, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_add_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v22
+; GFX9-NEXT: v_add_f32_e32 v21, v5, v21
+; GFX9-NEXT: v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT: v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT: v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[44:45], v21, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v21
+; GFX9-NEXT: v_add_f32_e32 v20, v4, v20
+; GFX9-NEXT: v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT: v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT: v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[48:49], v20, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_add_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v20
+; GFX9-NEXT: v_add_f32_e32 v19, v3, v19
+; GFX9-NEXT: v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT: v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT: v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[52:53], v19, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_add_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v19
+; GFX9-NEXT: v_add_f32_e32 v18, v2, v18
+; GFX9-NEXT: v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT: v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT: v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[56:57], v18, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_add_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v18
+; GFX9-NEXT: v_add_f32_e32 v17, v1, v17
+; GFX9-NEXT: v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT: v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[60:61], v17, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_add_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX9-NEXT: v_add_f32_e32 v16, v0, v16
+; GFX9-NEXT: v_add_f32_e32 v32, v32, v34
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v17
+; GFX9-NEXT: v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT: v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT: v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT: v_writelane_b32 v35, s67, 35
+; GFX9-NEXT: v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT: v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT: v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT: v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT: v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT: v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT: v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
@@ -10082,13 +11431,72 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT: v_readlane_b32 s67, v35, 35
+; GFX9-NEXT: v_readlane_b32 s66, v35, 34
+; GFX9-NEXT: v_readlane_b32 s65, v35, 33
+; GFX9-NEXT: v_readlane_b32 s64, v35, 32
+; GFX9-NEXT: v_readlane_b32 s63, v35, 31
+; GFX9-NEXT: v_readlane_b32 s62, v35, 30
+; GFX9-NEXT: v_readlane_b32 s61, v35, 29
+; GFX9-NEXT: v_readlane_b32 s60, v35, 28
+; GFX9-NEXT: v_readlane_b32 s59, v35, 27
+; GFX9-NEXT: v_readlane_b32 s58, v35, 26
+; GFX9-NEXT: v_readlane_b32 s57, v35, 25
+; GFX9-NEXT: v_readlane_b32 s56, v35, 24
+; GFX9-NEXT: v_readlane_b32 s55, v35, 23
+; GFX9-NEXT: v_readlane_b32 s54, v35, 22
+; GFX9-NEXT: v_readlane_b32 s53, v35, 21
+; GFX9-NEXT: v_readlane_b32 s52, v35, 20
+; GFX9-NEXT: v_readlane_b32 s51, v35, 19
+; GFX9-NEXT: v_readlane_b32 s50, v35, 18
+; GFX9-NEXT: v_readlane_b32 s49, v35, 17
+; GFX9-NEXT: v_readlane_b32 s48, v35, 16
+; GFX9-NEXT: v_readlane_b32 s47, v35, 15
+; GFX9-NEXT: v_readlane_b32 s46, v35, 14
+; GFX9-NEXT: v_readlane_b32 s45, v35, 13
+; GFX9-NEXT: v_readlane_b32 s44, v35, 12
+; GFX9-NEXT: v_readlane_b32 s43, v35, 11
+; GFX9-NEXT: v_readlane_b32 s42, v35, 10
+; GFX9-NEXT: v_readlane_b32 s41, v35, 9
+; GFX9-NEXT: v_readlane_b32 s40, v35, 8
+; GFX9-NEXT: v_readlane_b32 s39, v35, 7
+; GFX9-NEXT: v_readlane_b32 s38, v35, 6
+; GFX9-NEXT: v_readlane_b32 s37, v35, 5
+; GFX9-NEXT: v_readlane_b32 s36, v35, 4
+; GFX9-NEXT: v_readlane_b32 s35, v35, 3
+; GFX9-NEXT: v_readlane_b32 s34, v35, 2
+; GFX9-NEXT: v_readlane_b32 s31, v35, 1
+; GFX9-NEXT: v_readlane_b32 s30, v35, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
@@ -10101,14 +11509,18 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
@@ -10121,24 +11533,6 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
@@ -10149,20 +11543,88 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_add_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_add_f32_e32 v18, v27, v48
; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_add_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_add_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_add_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT: v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT: v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT: v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT: v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT: v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT: v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT: v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
; GFX10-NEXT: v_add_f32_e32 v51, v52, v51
; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
; GFX10-NEXT: v_add_f32_e32 v25, v54, v53
@@ -10172,142 +11634,423 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_add_f32_e32 v23, v66, v65
; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
; GFX10-NEXT: v_add_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_add_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_add_f32_e32 v29, v29, v36
-; GFX10-NEXT: v_add_f32_e32 v28, v28, v38
-; GFX10-NEXT: v_add_f32_e32 v27, v27, v48
-; GFX10-NEXT: v_add_f32_e32 v26, v26, v50
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT: v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT: v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT: v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT: v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT: v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT: v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT: v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT: v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT: v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT: v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT: v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT: v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT: v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT: v_add_f32_e32 v17, v31, v33
+; GFX10-NEXT: v_add_f32_e32 v15, v15, v32
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT: v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT: v_add_f32_e32 v16, v32, v16
-; GFX10-NEXT: v_add_f32_e32 v15, v15, v17
-; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_add_f32_e32 v23, v66, v65
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_add_f32 v4, v4, v20
+; GFX11-NEXT: v_add_f32_e32 v21, v70, v69
+; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_add_f32 v1, v1, v17
+; GFX11-NEXT: v_add_f32_e32 v17, v86, v85
+; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_dual_add_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_add_f32_e32 v22, v68, v67
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_add_f32_e32 v27, v50, v49
+; GFX11-NEXT: v_bfe_u32 v50, v10, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v18 :: v_dual_add_f32 v3, v3, v19
-; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v10, v10, v26
-; GFX11-NEXT: v_add_f32_e32 v26, v52, v51
-; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT: v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_add_f32_e32 v20, v80, v71
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_add_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: v_dual_add_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_add_f32_e32 v9, v9, v25
; GFX11-NEXT: v_add_f32_e32 v25, v54, v53
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_add_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_add_f32_e32 v23, v66, v65
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_add_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_add_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_add_f32_e32 v37, v86, v85
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_add_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
-; GFX11-NEXT: v_dual_add_f32 v34, v80, v71 :: v_dual_add_f32 v35, v82, v81
-; GFX11-NEXT: v_add_f32_e32 v36, v84, v83
-; GFX11-NEXT: v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v15, v15, v17
-; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT: v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT: v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT: v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT: v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT: v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT: v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT: v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT: v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT: v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT: v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT: v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT: v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT: v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT: v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT: v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT: v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT: v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT: v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT: v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT: v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT: v_add_f32_e32 v31, v31, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v15, v15, v32
+; GFX11-NEXT: v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT: v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <32 x bfloat> %a, %b
ret <32 x bfloat> %op
@@ -10335,7 +12078,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_bf16_fpimm_0:
@@ -10343,7 +12092,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16_fpimm_0:
@@ -10351,7 +12106,11 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_bf16_fpimm_0:
@@ -10360,7 +12119,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%add = fadd bfloat %arg0, 1.0
ret bfloat %add
@@ -10388,7 +12153,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_bf16_fpimm_1:
@@ -10396,7 +12167,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16_fpimm_1:
@@ -10404,7 +12181,11 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_bf16_fpimm_1:
@@ -10413,7 +12194,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%add = fadd bfloat %arg0, 42.0
ret bfloat %add
@@ -10444,7 +12231,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_bf16:
@@ -10453,7 +12246,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_bf16:
@@ -10462,7 +12261,11 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fsub_bf16:
@@ -10472,7 +12275,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fsub bfloat %a, %b
ret bfloat %op
@@ -10510,12 +12319,25 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v2bf16:
@@ -10523,11 +12345,23 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -10540,7 +12374,17 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fsub_v2bf16:
@@ -10553,8 +12397,22 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fsub <2 x bfloat> %a, %b
ret <2 x bfloat> %op
@@ -10601,15 +12459,34 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v3bf16:
@@ -10618,40 +12495,106 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fsub <3 x bfloat> %a, %b
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+;
+; GFX11-LABEL: v_fsub_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = fsub <3 x bfloat> %a, %b
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
@@ -10696,20 +12639,46 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v4bf16:
@@ -10717,17 +12686,39 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10738,17 +12729,37 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fsub_v4bf16:
@@ -10760,15 +12771,40 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fsub <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -10799,7 +12835,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_bf16:
@@ -10808,7 +12850,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_bf16:
@@ -10817,7 +12865,11 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_bf16:
@@ -10827,7 +12879,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul bfloat %a, %b
ret bfloat %op
@@ -10865,12 +12923,25 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v2bf16:
@@ -10878,11 +12949,23 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -10895,7 +12978,17 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v2bf16:
@@ -10908,8 +13001,22 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <2 x bfloat> %a, %b
ret <2 x bfloat> %op
@@ -10956,15 +13063,34 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v3bf16:
@@ -10973,32 +13099,98 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
@@ -11051,20 +13243,46 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v4bf16:
@@ -11072,17 +13290,39 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11093,17 +13333,37 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_mul_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v4bf16:
@@ -11115,15 +13375,40 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_mul_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -11209,36 +13494,86 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v8bf16:
@@ -11246,29 +13581,71 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
@@ -11287,58 +13664,151 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_mul_f32_e32 v9, v11, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_mul_f32_e32 v10, v11, v10
-; GFX10-NEXT: v_mul_f32_e32 v11, v13, v12
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_mul_f32_e32 v10, v10, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_mul_f32_e32 v11, v12, v11
+; GFX10-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v11, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_dual_mul_f32 v10, v11, v10 :: v_dual_mul_f32 v11, v13, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v3, v3, v7
+; GFX11-NEXT: v_mul_f32_e32 v9, v11, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_dual_mul_f32 v10, v10, v12 :: v_dual_mul_f32 v1, v1, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v11, v12, v11
+; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <8 x bfloat> %a, %b
ret <8 x bfloat> %op
@@ -11492,68 +13962,166 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v15, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v15, v18, v15
+; GFX8-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v14, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v14, v18, v14
+; GFX8-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v13, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v13, v18, v13
+; GFX8-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v12, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v12, v18, v12
+; GFX8-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v11, v18, v11
+; GFX8-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v10, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v10, v18, v10
+; GFX8-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v9, v18, v9
+; GFX8-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v16, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v16bf16:
@@ -11561,53 +14129,135 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v15, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v14, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v13, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v12, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v11, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
@@ -11625,119 +14275,294 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT: v_mul_f32_e32 v15, v17, v15
-; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
-; GFX10-NEXT: v_mul_f32_e32 v14, v19, v18
+; GFX10-NEXT: v_mul_f32_e32 v14, v20, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_mul_f32_e32 v13, v21, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v12
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v17, v20, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_mul_f32_e32 v18, v20, v19
+; GFX10-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT: v_mul_f32_e32 v12, v19, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_mul_f32_e32 v11, v19, v18
+; GFX10-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT: v_mul_f32_e32 v10, v21, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_mul_f32_e32 v19, v20, v19
-; GFX10-NEXT: v_mul_f32_e32 v20, v22, v21
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT: v_mul_f32_e32 v17, v19, v17
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_mul_f32_e32 v13, v21, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT: v_mul_f32_e32 v15, v17, v15
+; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v14, v20, v18 :: v_dual_mul_f32 v5, v5, v13
+; GFX11-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX11-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT: v_mul_f32_e32 v14, v19, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT: v_mul_f32_e32 v17, v20, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_mul_f32 v4, v4, v12
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_mul_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v12, v19, v18
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_dual_mul_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v11, v19, v18
+; GFX11-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v19, v20, v19 :: v_dual_mul_f32 v20, v22, v21
-; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v17, v19, v17 :: v_dual_mul_f32 v2, v2, v10
+; GFX11-NEXT: v_mul_f32_e32 v10, v21, v20
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <16 x bfloat> %a, %b
ret <16 x bfloat> %op
@@ -12145,247 +14970,716 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-LABEL: v_fmul_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v35, s30, 0
+; GFX8-NEXT: v_writelane_b32 v35, s31, 1
+; GFX8-NEXT: v_writelane_b32 v35, s34, 2
+; GFX8-NEXT: v_writelane_b32 v35, s35, 3
+; GFX8-NEXT: v_writelane_b32 v35, s36, 4
+; GFX8-NEXT: v_writelane_b32 v35, s37, 5
+; GFX8-NEXT: v_writelane_b32 v35, s38, 6
+; GFX8-NEXT: v_writelane_b32 v35, s39, 7
+; GFX8-NEXT: v_writelane_b32 v35, s40, 8
+; GFX8-NEXT: v_writelane_b32 v35, s41, 9
+; GFX8-NEXT: v_writelane_b32 v35, s42, 10
+; GFX8-NEXT: v_writelane_b32 v35, s43, 11
+; GFX8-NEXT: v_writelane_b32 v35, s44, 12
+; GFX8-NEXT: v_writelane_b32 v35, s45, 13
+; GFX8-NEXT: v_writelane_b32 v35, s46, 14
+; GFX8-NEXT: v_writelane_b32 v35, s47, 15
+; GFX8-NEXT: v_writelane_b32 v35, s48, 16
+; GFX8-NEXT: v_writelane_b32 v35, s49, 17
+; GFX8-NEXT: v_writelane_b32 v35, s50, 18
+; GFX8-NEXT: v_writelane_b32 v35, s51, 19
+; GFX8-NEXT: v_writelane_b32 v35, s52, 20
+; GFX8-NEXT: v_writelane_b32 v35, s53, 21
+; GFX8-NEXT: v_writelane_b32 v35, s54, 22
+; GFX8-NEXT: v_writelane_b32 v35, s55, 23
+; GFX8-NEXT: v_writelane_b32 v35, s56, 24
+; GFX8-NEXT: v_writelane_b32 v35, s57, 25
+; GFX8-NEXT: v_writelane_b32 v35, s58, 26
+; GFX8-NEXT: v_writelane_b32 v35, s59, 27
+; GFX8-NEXT: v_writelane_b32 v35, s60, 28
+; GFX8-NEXT: v_writelane_b32 v35, s61, 29
+; GFX8-NEXT: v_writelane_b32 v35, s62, 30
+; GFX8-NEXT: v_writelane_b32 v35, s63, 31
+; GFX8-NEXT: v_writelane_b32 v35, s64, 32
+; GFX8-NEXT: v_writelane_b32 v35, s65, 33
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_writelane_b32 v35, s66, 34
; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31
; GFX8-NEXT: v_mul_f32_e32 v30, v14, v30
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT: v_writelane_b32 v35, s67, 35
+; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT: v_writelane_b32 v35, s68, 36
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT: s_movk_i32 s68, 0x7fff
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v13
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_mul_f32_e32 v14, v32, v14
-; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_mul_f32_e32 v31, v31, v30
+; GFX8-NEXT: v_mul_f32_e32 v29, v13, v29
+; GFX8-NEXT: v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v31, v31, v29
+; GFX8-NEXT: v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_mul_f32_e32 v29, v32, v29
-; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v28, v12, v28
+; GFX8-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT: v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT: v_mul_f32_e32 v31, v15, v31
+; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT: v_mul_f32_e32 v28, v28, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v11
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_mul_f32_e32 v28, v32, v28
-; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_mul_f32_e32 v34, v34, v31
+; GFX8-NEXT: v_mul_f32_e32 v27, v11, v27
+; GFX8-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_mul_f32_e32 v27, v32, v27
-; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_mul_f32_e32 v34, v34, v27
+; GFX8-NEXT: v_mul_f32_e32 v26, v10, v26
+; GFX8-NEXT: v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v9
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_mul_f32_e32 v26, v32, v26
-; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_mul_f32_e32 v34, v34, v26
+; GFX8-NEXT: v_mul_f32_e32 v25, v9, v25
+; GFX8-NEXT: v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v8
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT: v_mul_f32_e32 v25, v32, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_mul_f32_e32 v32, v32, v33
-; GFX8-NEXT: v_mul_f32_e32 v15, v15, v24
+; GFX8-NEXT: v_mul_f32_e32 v34, v34, v25
+; GFX8-NEXT: v_mul_f32_e32 v24, v8, v24
+; GFX8-NEXT: v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v7
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24
-; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_mul_f32_e32 v34, v34, v24
+; GFX8-NEXT: v_mul_f32_e32 v23, v7, v23
+; GFX8-NEXT: v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT: v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_mul_f32_e32 v34, v34, v23
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT: v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT: v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v6, v34, v6
+; GFX8-NEXT: v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22
-; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT: v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v5, v21, v5
+; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT: v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21
-; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT: v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v20, v4
+; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT: v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20
-; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT: v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, v19, v3
+; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT: v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19
-; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT: v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v2, v18, v2
+; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT: v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18
-; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT: v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v17, v1
+; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT: v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT: v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT: v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], v0, v28
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], s68, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT: v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT: v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT: v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT: v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT: v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT: v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT: v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT: v_readlane_b32 s68, v35, 36
+; GFX8-NEXT: v_readlane_b32 s67, v35, 35
+; GFX8-NEXT: v_readlane_b32 s66, v35, 34
+; GFX8-NEXT: v_readlane_b32 s65, v35, 33
+; GFX8-NEXT: v_readlane_b32 s64, v35, 32
+; GFX8-NEXT: v_readlane_b32 s63, v35, 31
+; GFX8-NEXT: v_readlane_b32 s62, v35, 30
+; GFX8-NEXT: v_readlane_b32 s61, v35, 29
+; GFX8-NEXT: v_readlane_b32 s60, v35, 28
+; GFX8-NEXT: v_readlane_b32 s59, v35, 27
+; GFX8-NEXT: v_readlane_b32 s58, v35, 26
+; GFX8-NEXT: v_readlane_b32 s57, v35, 25
+; GFX8-NEXT: v_readlane_b32 s56, v35, 24
+; GFX8-NEXT: v_readlane_b32 s55, v35, 23
+; GFX8-NEXT: v_readlane_b32 s54, v35, 22
+; GFX8-NEXT: v_readlane_b32 s53, v35, 21
+; GFX8-NEXT: v_readlane_b32 s52, v35, 20
+; GFX8-NEXT: v_readlane_b32 s51, v35, 19
+; GFX8-NEXT: v_readlane_b32 s50, v35, 18
+; GFX8-NEXT: v_readlane_b32 s49, v35, 17
+; GFX8-NEXT: v_readlane_b32 s48, v35, 16
+; GFX8-NEXT: v_readlane_b32 s47, v35, 15
+; GFX8-NEXT: v_readlane_b32 s46, v35, 14
+; GFX8-NEXT: v_readlane_b32 s45, v35, 13
+; GFX8-NEXT: v_readlane_b32 s44, v35, 12
+; GFX8-NEXT: v_readlane_b32 s43, v35, 11
+; GFX8-NEXT: v_readlane_b32 s42, v35, 10
+; GFX8-NEXT: v_readlane_b32 s41, v35, 9
+; GFX8-NEXT: v_readlane_b32 s40, v35, 8
+; GFX8-NEXT: v_readlane_b32 s39, v35, 7
+; GFX8-NEXT: v_readlane_b32 s38, v35, 6
+; GFX8-NEXT: v_readlane_b32 s37, v35, 5
+; GFX8-NEXT: v_readlane_b32 s36, v35, 4
+; GFX8-NEXT: v_readlane_b32 s35, v35, 3
+; GFX8-NEXT: v_readlane_b32 s34, v35, 2
+; GFX8-NEXT: v_readlane_b32 s31, v35, 1
+; GFX8-NEXT: v_readlane_b32 s30, v35, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT: v_writelane_b32 v35, s30, 0
+; GFX9-NEXT: v_writelane_b32 v35, s31, 1
+; GFX9-NEXT: v_writelane_b32 v35, s34, 2
+; GFX9-NEXT: v_writelane_b32 v35, s35, 3
+; GFX9-NEXT: v_writelane_b32 v35, s36, 4
+; GFX9-NEXT: v_writelane_b32 v35, s37, 5
+; GFX9-NEXT: v_writelane_b32 v35, s38, 6
+; GFX9-NEXT: v_writelane_b32 v35, s39, 7
+; GFX9-NEXT: v_writelane_b32 v35, s40, 8
+; GFX9-NEXT: v_writelane_b32 v35, s41, 9
+; GFX9-NEXT: v_writelane_b32 v35, s42, 10
+; GFX9-NEXT: v_writelane_b32 v35, s43, 11
+; GFX9-NEXT: v_writelane_b32 v35, s44, 12
+; GFX9-NEXT: v_writelane_b32 v35, s45, 13
+; GFX9-NEXT: v_writelane_b32 v35, s46, 14
+; GFX9-NEXT: v_writelane_b32 v35, s47, 15
+; GFX9-NEXT: v_writelane_b32 v35, s48, 16
+; GFX9-NEXT: v_writelane_b32 v35, s49, 17
+; GFX9-NEXT: v_writelane_b32 v35, s50, 18
+; GFX9-NEXT: v_writelane_b32 v35, s51, 19
+; GFX9-NEXT: v_writelane_b32 v35, s52, 20
+; GFX9-NEXT: v_writelane_b32 v35, s53, 21
+; GFX9-NEXT: v_writelane_b32 v35, s54, 22
+; GFX9-NEXT: v_writelane_b32 v35, s55, 23
+; GFX9-NEXT: v_writelane_b32 v35, s56, 24
+; GFX9-NEXT: v_writelane_b32 v35, s57, 25
+; GFX9-NEXT: v_writelane_b32 v35, s58, 26
+; GFX9-NEXT: v_writelane_b32 v35, s59, 27
+; GFX9-NEXT: v_writelane_b32 v35, s60, 28
+; GFX9-NEXT: v_writelane_b32 v35, s61, 29
+; GFX9-NEXT: v_writelane_b32 v35, s62, 30
+; GFX9-NEXT: v_writelane_b32 v35, s63, 31
+; GFX9-NEXT: v_writelane_b32 v35, s64, 32
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s65, 33
+; GFX9-NEXT: v_mul_f32_e32 v32, v32, v31
+; GFX9-NEXT: v_mul_f32_e32 v30, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s66, 34
+; GFX9-NEXT: s_movk_i32 s66, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT: v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_mul_f32_e32 v32, v32, v30
+; GFX9-NEXT: v_mul_f32_e32 v29, v13, v29
+; GFX9-NEXT: v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT: v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT: v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_mul_f32_e32 v29, v32, v29
-; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29
+; GFX9-NEXT: v_mul_f32_e32 v28, v12, v28
+; GFX9-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT: v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT: v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT: v_mul_f32_e32 v32, v32, v28
+; GFX9-NEXT: v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_mul_f32_e32 v28, v32, v28
-; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_mul_f32_e32 v15, v15, v33
+; GFX9-NEXT: v_mul_f32_e32 v27, v11, v27
+; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT: v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT: v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_mul_f32_e32 v27, v32, v27
-; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v27
+; GFX9-NEXT: v_mul_f32_e32 v26, v10, v26
+; GFX9-NEXT: v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT: v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT: v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_mul_f32_e32 v26, v32, v26
-; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v26
+; GFX9-NEXT: v_mul_f32_e32 v25, v9, v25
+; GFX9-NEXT: v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT: v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT: v_mul_f32_e32 v25, v32, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_mul_f32_e32 v32, v32, v33
-; GFX9-NEXT: v_mul_f32_e32 v15, v15, v24
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v25
+; GFX9-NEXT: v_mul_f32_e32 v24, v8, v24
+; GFX9-NEXT: v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT: v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v24
+; GFX9-NEXT: v_mul_f32_e32 v23, v7, v23
+; GFX9-NEXT: v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT: v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT: v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v23
+; GFX9-NEXT: v_mul_f32_e32 v22, v6, v22
+; GFX9-NEXT: v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT: v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT: v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[40:41], v22, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v22
+; GFX9-NEXT: v_mul_f32_e32 v21, v5, v21
+; GFX9-NEXT: v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT: v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT: v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[44:45], v21, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v21
+; GFX9-NEXT: v_mul_f32_e32 v20, v4, v20
+; GFX9-NEXT: v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT: v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT: v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[48:49], v20, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v20
+; GFX9-NEXT: v_mul_f32_e32 v19, v3, v19
+; GFX9-NEXT: v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT: v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT: v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[52:53], v19, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v19
+; GFX9-NEXT: v_mul_f32_e32 v18, v2, v18
+; GFX9-NEXT: v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT: v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT: v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[56:57], v18, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v18
+; GFX9-NEXT: v_mul_f32_e32 v17, v1, v17
+; GFX9-NEXT: v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT: v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[60:61], v17, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX9-NEXT: v_mul_f32_e32 v16, v0, v16
+; GFX9-NEXT: v_mul_f32_e32 v32, v32, v34
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v17
+; GFX9-NEXT: v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT: v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT: v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT: v_writelane_b32 v35, s67, 35
+; GFX9-NEXT: v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT: v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT: v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT: v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT: v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT: v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT: v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
@@ -12394,25 +15688,64 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT: v_readlane_b32 s67, v35, 35
+; GFX9-NEXT: v_readlane_b32 s66, v35, 34
+; GFX9-NEXT: v_readlane_b32 s65, v35, 33
+; GFX9-NEXT: v_readlane_b32 s64, v35, 32
+; GFX9-NEXT: v_readlane_b32 s63, v35, 31
+; GFX9-NEXT: v_readlane_b32 s62, v35, 30
+; GFX9-NEXT: v_readlane_b32 s61, v35, 29
+; GFX9-NEXT: v_readlane_b32 s60, v35, 28
+; GFX9-NEXT: v_readlane_b32 s59, v35, 27
+; GFX9-NEXT: v_readlane_b32 s58, v35, 26
+; GFX9-NEXT: v_readlane_b32 s57, v35, 25
+; GFX9-NEXT: v_readlane_b32 s56, v35, 24
+; GFX9-NEXT: v_readlane_b32 s55, v35, 23
+; GFX9-NEXT: v_readlane_b32 s54, v35, 22
+; GFX9-NEXT: v_readlane_b32 s53, v35, 21
+; GFX9-NEXT: v_readlane_b32 s52, v35, 20
+; GFX9-NEXT: v_readlane_b32 s51, v35, 19
+; GFX9-NEXT: v_readlane_b32 s50, v35, 18
+; GFX9-NEXT: v_readlane_b32 s49, v35, 17
+; GFX9-NEXT: v_readlane_b32 s48, v35, 16
+; GFX9-NEXT: v_readlane_b32 s47, v35, 15
+; GFX9-NEXT: v_readlane_b32 s46, v35, 14
+; GFX9-NEXT: v_readlane_b32 s45, v35, 13
+; GFX9-NEXT: v_readlane_b32 s44, v35, 12
+; GFX9-NEXT: v_readlane_b32 s43, v35, 11
+; GFX9-NEXT: v_readlane_b32 s42, v35, 10
+; GFX9-NEXT: v_readlane_b32 s41, v35, 9
+; GFX9-NEXT: v_readlane_b32 s40, v35, 8
+; GFX9-NEXT: v_readlane_b32 s39, v35, 7
+; GFX9-NEXT: v_readlane_b32 s38, v35, 6
+; GFX9-NEXT: v_readlane_b32 s37, v35, 5
+; GFX9-NEXT: v_readlane_b32 s36, v35, 4
+; GFX9-NEXT: v_readlane_b32 s35, v35, 3
+; GFX9-NEXT: v_readlane_b32 s34, v35, 2
+; GFX9-NEXT: v_readlane_b32 s31, v35, 1
+; GFX9-NEXT: v_readlane_b32 s30, v35, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
@@ -12421,6 +15754,30 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
@@ -12433,7 +15790,73 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT: v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT: v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT: v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT: v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT: v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT: v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
@@ -12451,30 +15874,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT: v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53
@@ -12484,142 +15891,423 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65
; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_mul_f32_e32 v29, v29, v36
-; GFX10-NEXT: v_mul_f32_e32 v28, v28, v38
-; GFX10-NEXT: v_mul_f32_e32 v27, v27, v48
-; GFX10-NEXT: v_mul_f32_e32 v26, v26, v50
-; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT: v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT: v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT: v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT: v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT: v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT: v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT: v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT: v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT: v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT: v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT: v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT: v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT: v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT: v_mul_f32_e32 v17, v31, v33
+; GFX10-NEXT: v_mul_f32_e32 v15, v15, v32
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT: v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT: v_mul_f32_e32 v16, v32, v16
-; GFX10-NEXT: v_mul_f32_e32 v15, v15, v17
-; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_mul_f32_e32 v23, v66, v65
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_mul_f32 v4, v4, v20
+; GFX11-NEXT: v_mul_f32_e32 v21, v70, v69
+; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_mul_f32 v1, v1, v17
+; GFX11-NEXT: v_mul_f32_e32 v17, v86, v85
+; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_mul_f32_e32 v22, v68, v67
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_mul_f32_e32 v27, v50, v49
+; GFX11-NEXT: v_bfe_u32 v50, v10, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_mul_f32 v2, v2, v18 :: v_dual_mul_f32 v3, v3, v19
-; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e32 v10, v10, v26
-; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51
-; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT: v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_mul_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_mul_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: v_dual_mul_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX11-NEXT: v_mul_f32_e32 v25, v54, v53
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_mul_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_mul_f32_e32 v23, v66, v65
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_mul_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_mul_f32_e32 v37, v86, v85
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
-; GFX11-NEXT: v_dual_mul_f32 v34, v80, v71 :: v_dual_mul_f32 v35, v82, v81
-; GFX11-NEXT: v_mul_f32_e32 v36, v84, v83
-; GFX11-NEXT: v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v15, v15, v17
-; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT: v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT: v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT: v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT: v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT: v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT: v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT: v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT: v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT: v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT: v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT: v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT: v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT: v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT: v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT: v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT: v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT: v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT: v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT: v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT: v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT: v_mul_f32_e32 v31, v31, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v15, v15, v32
+; GFX11-NEXT: v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT: v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <32 x bfloat> %a, %b
ret <32 x bfloat> %op
@@ -12679,8 +16367,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc0
; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_bf16:
@@ -12690,6 +16384,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_rcp_f32_e32 v4, v2
; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4
@@ -12698,8 +16393,13 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc0
; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_bf16:
@@ -12718,7 +16418,11 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_bf16:
@@ -12743,7 +16447,13 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fdiv bfloat %a, %b
ret bfloat %op
@@ -13074,7 +16784,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_bf16:
@@ -13083,7 +16799,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_bf16:
@@ -13092,7 +16814,11 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_bf16:
@@ -13102,7 +16828,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
ret bfloat %op
@@ -13148,12 +16880,25 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_min_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_min_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v2bf16:
@@ -13161,11 +16906,23 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -13178,7 +16935,17 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_min_f32_e32 v2, v3, v2
; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v2bf16:
@@ -13191,8 +16958,22 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v2, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %op
@@ -13251,49 +17032,134 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v3bf16:
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
ret <3 x bfloat> %op
}
@@ -13362,20 +17228,46 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v4bf16:
@@ -13383,17 +17275,39 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -13404,17 +17318,37 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_min_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v4bf16:
@@ -13426,15 +17360,40 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
@@ -13552,36 +17511,86 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v8bf16:
@@ -13589,29 +17598,71 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
@@ -13630,58 +17681,151 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_min_f32_e32 v9, v11, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_min_f32_e32 v10, v11, v10
-; GFX10-NEXT: v_min_f32_e32 v11, v13, v12
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_min_f32_e32 v10, v10, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_min_f32_e32 v11, v12, v11
+; GFX10-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v3, v3, v7
+; GFX11-NEXT: v_min_f32_e32 v9, v11, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_dual_min_f32 v10, v10, v12 :: v_dual_min_f32 v1, v1, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v11, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v11, v12, v11
+; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
; GFX11-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_dual_min_f32 v10, v11, v10 :: v_dual_min_f32 v11, v13, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
@@ -13899,68 +18043,166 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v15, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT: v_min_f32_e32 v15, v18, v15
+; GFX8-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_min_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v14, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT: v_min_f32_e32 v14, v18, v14
+; GFX8-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_min_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v13, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT: v_min_f32_e32 v13, v18, v13
+; GFX8-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v12, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT: v_min_f32_e32 v12, v18, v12
+; GFX8-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_min_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT: v_min_f32_e32 v11, v18, v11
+; GFX8-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_min_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v10, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_min_f32_e32 v10, v18, v10
+; GFX8-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_min_f32_e32 v9, v18, v9
+; GFX8-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_min_f32_e32 v9, v17, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v16, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v16bf16:
@@ -13968,53 +18210,135 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v15, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v14, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v13, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v12, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v11, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
@@ -14032,119 +18356,294 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT: v_min_f32_e32 v15, v17, v15
-; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
-; GFX10-NEXT: v_min_f32_e32 v14, v19, v18
+; GFX10-NEXT: v_min_f32_e32 v14, v20, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_min_f32_e32 v13, v21, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v12
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_min_f32_e32 v17, v20, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_min_f32_e32 v18, v20, v19
+; GFX10-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT: v_min_f32_e32 v12, v19, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_min_f32_e32 v11, v19, v18
+; GFX10-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT: v_min_f32_e32 v10, v21, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_min_f32_e32 v19, v20, v19
-; GFX10-NEXT: v_min_f32_e32 v20, v22, v21
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT: v_min_f32_e32 v17, v19, v17
; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_min_f32_e32 v13, v21, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
-; GFX11-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT: v_min_f32_e32 v15, v17, v15
+; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_min_f32 v14, v20, v18 :: v_dual_min_f32 v5, v5, v13
+; GFX11-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX11-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT: v_min_f32_e32 v14, v19, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT: v_min_f32_e32 v17, v20, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_min_f32 v4, v4, v12
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_min_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v12, v19, v18
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_dual_min_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v11, v19, v18
+; GFX11-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v19, v20, v19 :: v_dual_min_f32 v20, v22, v21
-; GFX11-NEXT: v_min_f32_e32 v1, v1, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT: v_dual_min_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v17, v19, v17 :: v_dual_min_f32 v2, v2, v10
+; GFX11-NEXT: v_min_f32_e32 v10, v21, v20
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
ret <16 x bfloat> %op
@@ -14680,247 +19179,716 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-LABEL: v_minnum_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v35, s30, 0
+; GFX8-NEXT: v_writelane_b32 v35, s31, 1
+; GFX8-NEXT: v_writelane_b32 v35, s34, 2
+; GFX8-NEXT: v_writelane_b32 v35, s35, 3
+; GFX8-NEXT: v_writelane_b32 v35, s36, 4
+; GFX8-NEXT: v_writelane_b32 v35, s37, 5
+; GFX8-NEXT: v_writelane_b32 v35, s38, 6
+; GFX8-NEXT: v_writelane_b32 v35, s39, 7
+; GFX8-NEXT: v_writelane_b32 v35, s40, 8
+; GFX8-NEXT: v_writelane_b32 v35, s41, 9
+; GFX8-NEXT: v_writelane_b32 v35, s42, 10
+; GFX8-NEXT: v_writelane_b32 v35, s43, 11
+; GFX8-NEXT: v_writelane_b32 v35, s44, 12
+; GFX8-NEXT: v_writelane_b32 v35, s45, 13
+; GFX8-NEXT: v_writelane_b32 v35, s46, 14
+; GFX8-NEXT: v_writelane_b32 v35, s47, 15
+; GFX8-NEXT: v_writelane_b32 v35, s48, 16
+; GFX8-NEXT: v_writelane_b32 v35, s49, 17
+; GFX8-NEXT: v_writelane_b32 v35, s50, 18
+; GFX8-NEXT: v_writelane_b32 v35, s51, 19
+; GFX8-NEXT: v_writelane_b32 v35, s52, 20
+; GFX8-NEXT: v_writelane_b32 v35, s53, 21
+; GFX8-NEXT: v_writelane_b32 v35, s54, 22
+; GFX8-NEXT: v_writelane_b32 v35, s55, 23
+; GFX8-NEXT: v_writelane_b32 v35, s56, 24
+; GFX8-NEXT: v_writelane_b32 v35, s57, 25
+; GFX8-NEXT: v_writelane_b32 v35, s58, 26
+; GFX8-NEXT: v_writelane_b32 v35, s59, 27
+; GFX8-NEXT: v_writelane_b32 v35, s60, 28
+; GFX8-NEXT: v_writelane_b32 v35, s61, 29
+; GFX8-NEXT: v_writelane_b32 v35, s62, 30
+; GFX8-NEXT: v_writelane_b32 v35, s63, 31
+; GFX8-NEXT: v_writelane_b32 v35, s64, 32
+; GFX8-NEXT: v_writelane_b32 v35, s65, 33
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_writelane_b32 v35, s66, 34
; GFX8-NEXT: v_min_f32_e32 v31, v32, v31
; GFX8-NEXT: v_min_f32_e32 v30, v14, v30
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT: v_writelane_b32 v35, s67, 35
+; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT: v_writelane_b32 v35, s68, 36
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT: s_movk_i32 s68, 0x7fff
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v13
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_min_f32_e32 v14, v32, v14
-; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_min_f32_e32 v31, v31, v30
+; GFX8-NEXT: v_min_f32_e32 v29, v13, v29
+; GFX8-NEXT: v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT: v_min_f32_e32 v31, v31, v29
+; GFX8-NEXT: v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_min_f32_e32 v29, v32, v29
-; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT: v_min_f32_e32 v28, v12, v28
+; GFX8-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT: v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT: v_min_f32_e32 v31, v15, v31
+; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT: v_min_f32_e32 v28, v28, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v11
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_min_f32_e32 v28, v32, v28
-; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_min_f32_e32 v34, v34, v31
+; GFX8-NEXT: v_min_f32_e32 v27, v11, v27
+; GFX8-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_min_f32_e32 v27, v32, v27
-; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_min_f32_e32 v34, v34, v27
+; GFX8-NEXT: v_min_f32_e32 v26, v10, v26
+; GFX8-NEXT: v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v9
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_min_f32_e32 v26, v32, v26
-; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_min_f32_e32 v34, v34, v26
+; GFX8-NEXT: v_min_f32_e32 v25, v9, v25
+; GFX8-NEXT: v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v8
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT: v_min_f32_e32 v25, v32, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_min_f32_e32 v32, v32, v33
-; GFX8-NEXT: v_min_f32_e32 v15, v15, v24
+; GFX8-NEXT: v_min_f32_e32 v34, v34, v25
+; GFX8-NEXT: v_min_f32_e32 v24, v8, v24
+; GFX8-NEXT: v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v7
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_min_f32_e32 v24, v33, v24
-; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_min_f32_e32 v34, v34, v24
+; GFX8-NEXT: v_min_f32_e32 v23, v7, v23
+; GFX8-NEXT: v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT: v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_min_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_min_f32_e32 v34, v34, v23
; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT: v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT: v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT: v_min_f32_e32 v6, v34, v6
+; GFX8-NEXT: v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_min_f32_e32 v22, v33, v22
-; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v6
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT: v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT: v_min_f32_e32 v5, v21, v5
+; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT: v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v21, v33, v21
-; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v5
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT: v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT: v_min_f32_e32 v4, v20, v4
+; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT: v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_min_f32_e32 v20, v33, v20
-; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT: v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT: v_min_f32_e32 v3, v19, v3
+; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT: v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_min_f32_e32 v19, v33, v19
-; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT: v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_min_f32_e32 v2, v18, v2
+; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT: v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v18, v33, v18
-; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT: v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_min_f32_e32 v1, v17, v1
+; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT: v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_min_f32_e32 v17, v33, v17
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT: v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT: v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], v0, v28
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], s68, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT: v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT: v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT: v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT: v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT: v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT: v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT: v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT: v_readlane_b32 s68, v35, 36
+; GFX8-NEXT: v_readlane_b32 s67, v35, 35
+; GFX8-NEXT: v_readlane_b32 s66, v35, 34
+; GFX8-NEXT: v_readlane_b32 s65, v35, 33
+; GFX8-NEXT: v_readlane_b32 s64, v35, 32
+; GFX8-NEXT: v_readlane_b32 s63, v35, 31
+; GFX8-NEXT: v_readlane_b32 s62, v35, 30
+; GFX8-NEXT: v_readlane_b32 s61, v35, 29
+; GFX8-NEXT: v_readlane_b32 s60, v35, 28
+; GFX8-NEXT: v_readlane_b32 s59, v35, 27
+; GFX8-NEXT: v_readlane_b32 s58, v35, 26
+; GFX8-NEXT: v_readlane_b32 s57, v35, 25
+; GFX8-NEXT: v_readlane_b32 s56, v35, 24
+; GFX8-NEXT: v_readlane_b32 s55, v35, 23
+; GFX8-NEXT: v_readlane_b32 s54, v35, 22
+; GFX8-NEXT: v_readlane_b32 s53, v35, 21
+; GFX8-NEXT: v_readlane_b32 s52, v35, 20
+; GFX8-NEXT: v_readlane_b32 s51, v35, 19
+; GFX8-NEXT: v_readlane_b32 s50, v35, 18
+; GFX8-NEXT: v_readlane_b32 s49, v35, 17
+; GFX8-NEXT: v_readlane_b32 s48, v35, 16
+; GFX8-NEXT: v_readlane_b32 s47, v35, 15
+; GFX8-NEXT: v_readlane_b32 s46, v35, 14
+; GFX8-NEXT: v_readlane_b32 s45, v35, 13
+; GFX8-NEXT: v_readlane_b32 s44, v35, 12
+; GFX8-NEXT: v_readlane_b32 s43, v35, 11
+; GFX8-NEXT: v_readlane_b32 s42, v35, 10
+; GFX8-NEXT: v_readlane_b32 s41, v35, 9
+; GFX8-NEXT: v_readlane_b32 s40, v35, 8
+; GFX8-NEXT: v_readlane_b32 s39, v35, 7
+; GFX8-NEXT: v_readlane_b32 s38, v35, 6
+; GFX8-NEXT: v_readlane_b32 s37, v35, 5
+; GFX8-NEXT: v_readlane_b32 s36, v35, 4
+; GFX8-NEXT: v_readlane_b32 s35, v35, 3
+; GFX8-NEXT: v_readlane_b32 s34, v35, 2
+; GFX8-NEXT: v_readlane_b32 s31, v35, 1
+; GFX8-NEXT: v_readlane_b32 s30, v35, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT: v_writelane_b32 v35, s30, 0
+; GFX9-NEXT: v_writelane_b32 v35, s31, 1
+; GFX9-NEXT: v_writelane_b32 v35, s34, 2
+; GFX9-NEXT: v_writelane_b32 v35, s35, 3
+; GFX9-NEXT: v_writelane_b32 v35, s36, 4
+; GFX9-NEXT: v_writelane_b32 v35, s37, 5
+; GFX9-NEXT: v_writelane_b32 v35, s38, 6
+; GFX9-NEXT: v_writelane_b32 v35, s39, 7
+; GFX9-NEXT: v_writelane_b32 v35, s40, 8
+; GFX9-NEXT: v_writelane_b32 v35, s41, 9
+; GFX9-NEXT: v_writelane_b32 v35, s42, 10
+; GFX9-NEXT: v_writelane_b32 v35, s43, 11
+; GFX9-NEXT: v_writelane_b32 v35, s44, 12
+; GFX9-NEXT: v_writelane_b32 v35, s45, 13
+; GFX9-NEXT: v_writelane_b32 v35, s46, 14
+; GFX9-NEXT: v_writelane_b32 v35, s47, 15
+; GFX9-NEXT: v_writelane_b32 v35, s48, 16
+; GFX9-NEXT: v_writelane_b32 v35, s49, 17
+; GFX9-NEXT: v_writelane_b32 v35, s50, 18
+; GFX9-NEXT: v_writelane_b32 v35, s51, 19
+; GFX9-NEXT: v_writelane_b32 v35, s52, 20
+; GFX9-NEXT: v_writelane_b32 v35, s53, 21
+; GFX9-NEXT: v_writelane_b32 v35, s54, 22
+; GFX9-NEXT: v_writelane_b32 v35, s55, 23
+; GFX9-NEXT: v_writelane_b32 v35, s56, 24
+; GFX9-NEXT: v_writelane_b32 v35, s57, 25
+; GFX9-NEXT: v_writelane_b32 v35, s58, 26
+; GFX9-NEXT: v_writelane_b32 v35, s59, 27
+; GFX9-NEXT: v_writelane_b32 v35, s60, 28
+; GFX9-NEXT: v_writelane_b32 v35, s61, 29
+; GFX9-NEXT: v_writelane_b32 v35, s62, 30
+; GFX9-NEXT: v_writelane_b32 v35, s63, 31
+; GFX9-NEXT: v_writelane_b32 v35, s64, 32
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s65, 33
+; GFX9-NEXT: v_min_f32_e32 v32, v32, v31
+; GFX9-NEXT: v_min_f32_e32 v30, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s66, 34
+; GFX9-NEXT: s_movk_i32 s66, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT: v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_min_f32_e32 v32, v32, v30
+; GFX9-NEXT: v_min_f32_e32 v29, v13, v29
+; GFX9-NEXT: v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT: v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT: v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_min_f32_e32 v29, v32, v29
-; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_min_f32_e32 v32, v32, v29
+; GFX9-NEXT: v_min_f32_e32 v28, v12, v28
+; GFX9-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT: v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT: v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT: v_min_f32_e32 v32, v32, v28
+; GFX9-NEXT: v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_min_f32_e32 v28, v32, v28
-; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_min_f32_e32 v15, v15, v33
+; GFX9-NEXT: v_min_f32_e32 v27, v11, v27
+; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT: v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT: v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_min_f32_e32 v27, v32, v27
-; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v27
+; GFX9-NEXT: v_min_f32_e32 v26, v10, v26
+; GFX9-NEXT: v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT: v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT: v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_min_f32_e32 v26, v32, v26
-; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v26
+; GFX9-NEXT: v_min_f32_e32 v25, v9, v25
+; GFX9-NEXT: v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT: v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT: v_min_f32_e32 v25, v32, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_min_f32_e32 v32, v32, v33
-; GFX9-NEXT: v_min_f32_e32 v15, v15, v24
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v25
+; GFX9-NEXT: v_min_f32_e32 v24, v8, v24
+; GFX9-NEXT: v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT: v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_min_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v24
+; GFX9-NEXT: v_min_f32_e32 v23, v7, v23
+; GFX9-NEXT: v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT: v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT: v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_min_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v23
+; GFX9-NEXT: v_min_f32_e32 v22, v6, v22
+; GFX9-NEXT: v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT: v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT: v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[40:41], v22, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_min_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v22
+; GFX9-NEXT: v_min_f32_e32 v21, v5, v21
+; GFX9-NEXT: v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT: v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT: v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[44:45], v21, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_min_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v21
+; GFX9-NEXT: v_min_f32_e32 v20, v4, v20
+; GFX9-NEXT: v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT: v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT: v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[48:49], v20, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_min_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v20
+; GFX9-NEXT: v_min_f32_e32 v19, v3, v19
+; GFX9-NEXT: v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT: v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT: v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[52:53], v19, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_min_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v19
+; GFX9-NEXT: v_min_f32_e32 v18, v2, v18
+; GFX9-NEXT: v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT: v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT: v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[56:57], v18, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_min_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v18
+; GFX9-NEXT: v_min_f32_e32 v17, v1, v17
+; GFX9-NEXT: v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT: v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[60:61], v17, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_min_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX9-NEXT: v_min_f32_e32 v16, v0, v16
+; GFX9-NEXT: v_min_f32_e32 v32, v32, v34
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v17
+; GFX9-NEXT: v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT: v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT: v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT: v_writelane_b32 v35, s67, 35
+; GFX9-NEXT: v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT: v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT: v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT: v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT: v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT: v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT: v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
@@ -14929,13 +19897,72 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT: v_readlane_b32 s67, v35, 35
+; GFX9-NEXT: v_readlane_b32 s66, v35, 34
+; GFX9-NEXT: v_readlane_b32 s65, v35, 33
+; GFX9-NEXT: v_readlane_b32 s64, v35, 32
+; GFX9-NEXT: v_readlane_b32 s63, v35, 31
+; GFX9-NEXT: v_readlane_b32 s62, v35, 30
+; GFX9-NEXT: v_readlane_b32 s61, v35, 29
+; GFX9-NEXT: v_readlane_b32 s60, v35, 28
+; GFX9-NEXT: v_readlane_b32 s59, v35, 27
+; GFX9-NEXT: v_readlane_b32 s58, v35, 26
+; GFX9-NEXT: v_readlane_b32 s57, v35, 25
+; GFX9-NEXT: v_readlane_b32 s56, v35, 24
+; GFX9-NEXT: v_readlane_b32 s55, v35, 23
+; GFX9-NEXT: v_readlane_b32 s54, v35, 22
+; GFX9-NEXT: v_readlane_b32 s53, v35, 21
+; GFX9-NEXT: v_readlane_b32 s52, v35, 20
+; GFX9-NEXT: v_readlane_b32 s51, v35, 19
+; GFX9-NEXT: v_readlane_b32 s50, v35, 18
+; GFX9-NEXT: v_readlane_b32 s49, v35, 17
+; GFX9-NEXT: v_readlane_b32 s48, v35, 16
+; GFX9-NEXT: v_readlane_b32 s47, v35, 15
+; GFX9-NEXT: v_readlane_b32 s46, v35, 14
+; GFX9-NEXT: v_readlane_b32 s45, v35, 13
+; GFX9-NEXT: v_readlane_b32 s44, v35, 12
+; GFX9-NEXT: v_readlane_b32 s43, v35, 11
+; GFX9-NEXT: v_readlane_b32 s42, v35, 10
+; GFX9-NEXT: v_readlane_b32 s41, v35, 9
+; GFX9-NEXT: v_readlane_b32 s40, v35, 8
+; GFX9-NEXT: v_readlane_b32 s39, v35, 7
+; GFX9-NEXT: v_readlane_b32 s38, v35, 6
+; GFX9-NEXT: v_readlane_b32 s37, v35, 5
+; GFX9-NEXT: v_readlane_b32 s36, v35, 4
+; GFX9-NEXT: v_readlane_b32 s35, v35, 3
+; GFX9-NEXT: v_readlane_b32 s34, v35, 2
+; GFX9-NEXT: v_readlane_b32 s31, v35, 1
+; GFX9-NEXT: v_readlane_b32 s30, v35, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
@@ -14948,14 +19975,18 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
@@ -14968,7 +19999,73 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_min_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_min_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_min_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_min_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_min_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT: v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT: v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT: v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT: v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT: v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT: v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
@@ -14986,30 +20083,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT: v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
; GFX10-NEXT: v_min_f32_e32 v25, v54, v53
@@ -15019,142 +20100,423 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_min_f32_e32 v23, v66, v65
; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
; GFX10-NEXT: v_min_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_min_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_min_f32_e32 v29, v29, v36
-; GFX10-NEXT: v_min_f32_e32 v28, v28, v38
-; GFX10-NEXT: v_min_f32_e32 v27, v27, v48
-; GFX10-NEXT: v_min_f32_e32 v26, v26, v50
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT: v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT: v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT: v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT: v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT: v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT: v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT: v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT: v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT: v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT: v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT: v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT: v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT: v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT: v_min_f32_e32 v17, v31, v33
+; GFX10-NEXT: v_min_f32_e32 v15, v15, v32
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT: v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT: v_min_f32_e32 v16, v32, v16
-; GFX10-NEXT: v_min_f32_e32 v15, v15, v17
-; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_min_f32_e32 v23, v66, v65
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v4, v4, v20
+; GFX11-NEXT: v_min_f32_e32 v21, v70, v69
+; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17
+; GFX11-NEXT: v_min_f32_e32 v17, v86, v85
+; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_dual_min_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_min_f32_e32 v22, v68, v67
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_min_f32_e32 v27, v50, v49
+; GFX11-NEXT: v_bfe_u32 v50, v10, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_min_f32 v2, v2, v18 :: v_dual_min_f32 v3, v3, v19
-; GFX11-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX11-NEXT: v_min_f32_e32 v26, v52, v51
-; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT: v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_min_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_min_f32_e32 v20, v80, v71
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_min_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: v_dual_min_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_min_f32_e32 v9, v9, v25
; GFX11-NEXT: v_min_f32_e32 v25, v54, v53
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_min_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_min_f32_e32 v23, v66, v65
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_min_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_min_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_min_f32_e32 v37, v86, v85
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_min_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
-; GFX11-NEXT: v_dual_min_f32 v34, v80, v71 :: v_dual_min_f32 v35, v82, v81
-; GFX11-NEXT: v_min_f32_e32 v36, v84, v83
-; GFX11-NEXT: v_dual_min_f32 v16, v32, v16 :: v_dual_min_f32 v15, v15, v17
-; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT: v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT: v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT: v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT: v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT: v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT: v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT: v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT: v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT: v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT: v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT: v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT: v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT: v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT: v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT: v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT: v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT: v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT: v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT: v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT: v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT: v_min_f32_e32 v31, v31, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v15, v15, v32
+; GFX11-NEXT: v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT: v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
ret <32 x bfloat> %op
@@ -15198,7 +20560,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_bf16:
@@ -15207,7 +20575,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_bf16:
@@ -15216,7 +20590,11 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_bf16:
@@ -15226,7 +20604,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
ret bfloat %op
@@ -15272,12 +20656,25 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_max_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v2bf16:
@@ -15285,11 +20682,23 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -15302,7 +20711,17 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_max_f32_e32 v2, v3, v2
; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v2bf16:
@@ -15315,8 +20734,22 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %op
@@ -15375,15 +20808,34 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v3bf16:
@@ -15392,32 +20844,98 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v3, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
ret <3 x bfloat> %op
}
@@ -15486,20 +21004,46 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v4bf16:
@@ -15507,17 +21051,39 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -15528,17 +21094,37 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_max_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v4bf16:
@@ -15550,15 +21136,40 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
@@ -15676,36 +21287,86 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v8bf16:
@@ -15713,29 +21374,71 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v7, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
@@ -15754,58 +21457,151 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_max_f32_e32 v9, v11, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_max_f32_e32 v10, v11, v10
-; GFX10-NEXT: v_max_f32_e32 v11, v13, v12
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT: v_max_f32_e32 v11, v12, v11
+; GFX10-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v3, v3, v7
+; GFX11-NEXT: v_max_f32_e32 v9, v11, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v12 :: v_dual_max_f32 v1, v1, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v11, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v11, v12, v11
+; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_dual_max_f32 v10, v11, v10 :: v_dual_max_f32 v11, v13, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v12, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v11, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v12, v12, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
@@ -16023,68 +21819,166 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v15, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT: v_max_f32_e32 v15, v18, v15
+; GFX8-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_max_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v14, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT: v_max_f32_e32 v14, v18, v14
+; GFX8-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_max_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v13, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; GFX8-NEXT: v_max_f32_e32 v13, v18, v13
+; GFX8-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v12, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; GFX8-NEXT: v_max_f32_e32 v12, v18, v12
+; GFX8-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_max_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; GFX8-NEXT: v_max_f32_e32 v11, v18, v11
+; GFX8-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_max_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v10, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_max_f32_e32 v10, v18, v10
+; GFX8-NEXT: v_bfe_u32 v18, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_max_f32_e32 v9, v18, v9
+; GFX8-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_max_f32_e32 v9, v17, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v13, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v14, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v16, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v16bf16:
@@ -16092,53 +21986,135 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_mov_b32_e32 v18, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v15, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v14, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v13, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v12, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v11, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
@@ -16156,119 +22132,294 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v15, v18, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX10-NEXT: v_max_f32_e32 v15, v17, v15
-; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
-; GFX10-NEXT: v_max_f32_e32 v14, v19, v18
+; GFX10-NEXT: v_max_f32_e32 v14, v20, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX10-NEXT: v_max_f32_e32 v13, v21, v20
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX10-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v12
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v17, v20, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v18, v20, v19
+; GFX10-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX10-NEXT: v_max_f32_e32 v12, v19, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_max_f32_e32 v11, v19, v18
+; GFX10-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX10-NEXT: v_max_f32_e32 v10, v21, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_max_f32_e32 v19, v20, v19
-; GFX10-NEXT: v_max_f32_e32 v20, v22, v21
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX10-NEXT: v_max_f32_e32 v17, v19, v17
; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
-; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX10-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_max_f32_e32 v13, v21, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_max_f32_e32 v7, v7, v15
; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT: v_max_f32_e32 v15, v17, v15
+; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v15, v18, v15 :: v_dual_lshlrev_b32 v18, 16, v13
+; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v14, v20, v18 :: v_dual_max_f32 v5, v5, v13
+; GFX11-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX11-NEXT: v_bfe_u32 v20, v14, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: v_add3_u32 v17, v17, v16, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v19, v7, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
-; GFX11-NEXT: v_max_f32_e32 v14, v19, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT: v_add3_u32 v17, v19, v6, 0x7fff
+; GFX11-NEXT: v_add3_u32 v18, v20, v14, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x5040100
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GFX11-NEXT: v_add3_u32 v18, v19, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v13, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v15, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v11
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_perm_b32 v6, v6, v13, 0x5040100
+; GFX11-NEXT: v_max_f32_e32 v17, v20, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, 0x7fc0, v15 :: v_dual_max_f32 v4, v4, v12
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_max_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302
-; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v12, v19, v18
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX11-NEXT: v_dual_max_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT: v_add3_u32 v15, v15, v17, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v12, 16, 1
+; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_dual_cndmask_b32 v15, 0x7fc0, v15 :: v_dual_lshlrev_b32 v18, 16, v10
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v11, v19, v18
+; GFX11-NEXT: v_add3_u32 v18, v20, v4, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v21, v12, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v20, v21, v11, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v19, v20, v19 :: v_dual_max_f32 v20, v22, v21
-; GFX11-NEXT: v_max_f32_e32 v1, v1, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v17, v19, v17 :: v_dual_max_f32 v2, v2, v10
+; GFX11-NEXT: v_max_f32_e32 v10, v21, v20
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v20, v10, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v19, v20, v10, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v19, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v20, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v11, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v4, v4, v15, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
ret <16 x bfloat> %op
@@ -16804,247 +22955,716 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-LABEL: v_maxnum_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v35, s30, 0
+; GFX8-NEXT: v_writelane_b32 v35, s31, 1
+; GFX8-NEXT: v_writelane_b32 v35, s34, 2
+; GFX8-NEXT: v_writelane_b32 v35, s35, 3
+; GFX8-NEXT: v_writelane_b32 v35, s36, 4
+; GFX8-NEXT: v_writelane_b32 v35, s37, 5
+; GFX8-NEXT: v_writelane_b32 v35, s38, 6
+; GFX8-NEXT: v_writelane_b32 v35, s39, 7
+; GFX8-NEXT: v_writelane_b32 v35, s40, 8
+; GFX8-NEXT: v_writelane_b32 v35, s41, 9
+; GFX8-NEXT: v_writelane_b32 v35, s42, 10
+; GFX8-NEXT: v_writelane_b32 v35, s43, 11
+; GFX8-NEXT: v_writelane_b32 v35, s44, 12
+; GFX8-NEXT: v_writelane_b32 v35, s45, 13
+; GFX8-NEXT: v_writelane_b32 v35, s46, 14
+; GFX8-NEXT: v_writelane_b32 v35, s47, 15
+; GFX8-NEXT: v_writelane_b32 v35, s48, 16
+; GFX8-NEXT: v_writelane_b32 v35, s49, 17
+; GFX8-NEXT: v_writelane_b32 v35, s50, 18
+; GFX8-NEXT: v_writelane_b32 v35, s51, 19
+; GFX8-NEXT: v_writelane_b32 v35, s52, 20
+; GFX8-NEXT: v_writelane_b32 v35, s53, 21
+; GFX8-NEXT: v_writelane_b32 v35, s54, 22
+; GFX8-NEXT: v_writelane_b32 v35, s55, 23
+; GFX8-NEXT: v_writelane_b32 v35, s56, 24
+; GFX8-NEXT: v_writelane_b32 v35, s57, 25
+; GFX8-NEXT: v_writelane_b32 v35, s58, 26
+; GFX8-NEXT: v_writelane_b32 v35, s59, 27
+; GFX8-NEXT: v_writelane_b32 v35, s60, 28
+; GFX8-NEXT: v_writelane_b32 v35, s61, 29
+; GFX8-NEXT: v_writelane_b32 v35, s62, 30
+; GFX8-NEXT: v_writelane_b32 v35, s63, 31
+; GFX8-NEXT: v_writelane_b32 v35, s64, 32
+; GFX8-NEXT: v_writelane_b32 v35, s65, 33
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_writelane_b32 v35, s66, 34
; GFX8-NEXT: v_max_f32_e32 v31, v32, v31
; GFX8-NEXT: v_max_f32_e32 v30, v14, v30
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT: v_writelane_b32 v35, s67, 35
+; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX8-NEXT: v_writelane_b32 v35, s68, 36
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT: s_movk_i32 s68, 0x7fff
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, s68, v32
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], s68, v14
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v13
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT: v_max_f32_e32 v14, v32, v14
-; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_max_f32_e32 v31, v31, v30
+; GFX8-NEXT: v_max_f32_e32 v29, v13, v29
+; GFX8-NEXT: v_bfe_u32 v30, v31, 16, 1
+; GFX8-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], v30, v31
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], v13, v29
+; GFX8-NEXT: v_add_u32_e64 v30, s[6:7], s68, v30
+; GFX8-NEXT: v_add_u32_e64 v13, s[8:9], s68, v13
+; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v31, v31
+; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v12
+; GFX8-NEXT: v_max_f32_e32 v31, v31, v29
+; GFX8-NEXT: v_bfe_u32 v29, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], v29, v31
+; GFX8-NEXT: v_add_u32_e64 v29, s[10:11], s68, v29
+; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v31, v31
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT: v_max_f32_e32 v29, v32, v29
-; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT: v_max_f32_e32 v28, v12, v28
+; GFX8-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], v12, v28
+; GFX8-NEXT: v_add_u32_e64 v12, s[12:13], s68, v12
+; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_mov_b32_e32 v33, 0x7fc0
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v33, v14, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v33, v12, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v30, v33, v30, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v29, v33, v29, s[10:11]
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_or_b32_e32 v12, v29, v12
+; GFX8-NEXT: v_or_b32_e32 v13, v30, v13
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v31
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT: v_max_f32_e32 v31, v15, v31
+; GFX8-NEXT: v_bfe_u32 v15, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], v15, v31
+; GFX8-NEXT: v_add_u32_e64 v15, s[14:15], s68, v15
+; GFX8-NEXT: v_max_f32_e32 v28, v28, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v31, v31
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v11
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT: v_max_f32_e32 v28, v32, v28
-; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_max_f32_e32 v34, v34, v31
+; GFX8-NEXT: v_max_f32_e32 v27, v11, v27
+; GFX8-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], v31, v34
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], v11, v27
+; GFX8-NEXT: v_add_u32_e64 v31, s[16:17], s68, v31
+; GFX8-NEXT: v_add_u32_e64 v11, s[18:19], s68, v11
+; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT: v_max_f32_e32 v27, v32, v27
-; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_max_f32_e32 v34, v34, v27
+; GFX8-NEXT: v_max_f32_e32 v26, v10, v26
+; GFX8-NEXT: v_bfe_u32 v27, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], v27, v34
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], v10, v26
+; GFX8-NEXT: v_add_u32_e64 v27, s[20:21], s68, v27
+; GFX8-NEXT: v_add_u32_e64 v10, s[22:23], s68, v10
+; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v9
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT: v_max_f32_e32 v26, v32, v26
-; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_max_f32_e32 v34, v34, v26
+; GFX8-NEXT: v_max_f32_e32 v25, v9, v25
+; GFX8-NEXT: v_bfe_u32 v26, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], v26, v34
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], v9, v25
+; GFX8-NEXT: v_add_u32_e64 v26, s[24:25], s68, v26
+; GFX8-NEXT: v_add_u32_e64 v9, s[26:27], s68, v9
+; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v8
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT: v_max_f32_e32 v25, v32, v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX8-NEXT: v_max_f32_e32 v32, v32, v33
-; GFX8-NEXT: v_max_f32_e32 v15, v15, v24
+; GFX8-NEXT: v_max_f32_e32 v34, v34, v25
+; GFX8-NEXT: v_max_f32_e32 v24, v8, v24
+; GFX8-NEXT: v_bfe_u32 v25, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], v25, v34
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], v8, v24
+; GFX8-NEXT: v_add_u32_e64 v25, s[28:29], s68, v25
+; GFX8-NEXT: v_add_u32_e64 v8, s[30:31], s68, v8
+; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v7
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT: v_max_f32_e32 v24, v33, v24
-; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_max_f32_e32 v34, v34, v24
+; GFX8-NEXT: v_max_f32_e32 v23, v7, v23
+; GFX8-NEXT: v_bfe_u32 v24, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], v24, v34
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], v7, v23
+; GFX8-NEXT: v_add_u32_e64 v24, s[34:35], s68, v24
+; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], s68, v7
+; GFX8-NEXT: v_cmp_o_f32_e64 s[34:35], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_max_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_max_f32_e32 v34, v34, v23
; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT: v_bfe_u32 v23, v34, 16, 1
+; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], v23, v34
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], v22, v6
+; GFX8-NEXT: v_add_u32_e64 v23, s[38:39], s68, v23
+; GFX8-NEXT: v_add_u32_e64 v22, s[40:41], s68, v22
+; GFX8-NEXT: v_cmp_o_f32_e64 s[38:39], v34, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v5
+; GFX8-NEXT: v_max_f32_e32 v6, v34, v6
+; GFX8-NEXT: v_bfe_u32 v34, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], v34, v6
+; GFX8-NEXT: v_add_u32_e64 v34, s[42:43], s68, v34
+; GFX8-NEXT: v_cmp_o_f32_e64 s[42:43], v6, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT: v_max_f32_e32 v22, v33, v22
-; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v6
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], v6, v5
+; GFX8-NEXT: v_add_u32_e64 v6, s[44:45], s68, v6
+; GFX8-NEXT: v_cmp_o_f32_e64 s[44:45], v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT: v_max_f32_e32 v5, v21, v5
+; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], v21, v5
+; GFX8-NEXT: v_add_u32_e64 v21, s[46:47], s68, v21
+; GFX8-NEXT: v_cmp_o_f32_e64 s[46:47], v5, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v21, v33, v21
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v5
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], v5, v4
+; GFX8-NEXT: v_add_u32_e64 v5, s[48:49], s68, v5
+; GFX8-NEXT: v_cmp_o_f32_e64 s[48:49], v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT: v_max_f32_e32 v4, v20, v4
+; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], v20, v4
+; GFX8-NEXT: v_add_u32_e64 v20, s[50:51], s68, v20
+; GFX8-NEXT: v_cmp_o_f32_e64 s[50:51], v4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_max_f32_e32 v20, v33, v20
-; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], v4, v3
+; GFX8-NEXT: v_add_u32_e64 v4, s[52:53], s68, v4
+; GFX8-NEXT: v_cmp_o_f32_e64 s[52:53], v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT: v_max_f32_e32 v3, v19, v3
+; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], v19, v3
+; GFX8-NEXT: v_add_u32_e64 v19, s[54:55], s68, v19
+; GFX8-NEXT: v_cmp_o_f32_e64 s[54:55], v3, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_max_f32_e32 v19, v33, v19
-; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], v3, v2
+; GFX8-NEXT: v_add_u32_e64 v3, s[56:57], s68, v3
+; GFX8-NEXT: v_cmp_o_f32_e64 s[56:57], v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_max_f32_e32 v2, v18, v2
+; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], v18, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[58:59], s68, v18
+; GFX8-NEXT: v_cmp_o_f32_e64 s[58:59], v2, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v18, v33, v18
-; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], v2, v1
+; GFX8-NEXT: v_add_u32_e64 v2, s[60:61], s68, v2
+; GFX8-NEXT: v_cmp_o_f32_e64 s[60:61], v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_max_f32_e32 v1, v17, v1
+; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], v17, v1
+; GFX8-NEXT: v_add_u32_e64 v17, s[62:63], s68, v17
+; GFX8-NEXT: v_cmp_o_f32_e64 s[62:63], v1, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_max_f32_e32 v17, v33, v17
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], v1, v0
+; GFX8-NEXT: v_add_u32_e64 v1, s[64:65], s68, v1
+; GFX8-NEXT: v_cmp_o_f32_e64 s[64:65], v0, v0
+; GFX8-NEXT: v_bfe_u32 v0, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], v0, v28
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_add_u32_e64 v0, s[66:67], s68, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cmp_o_f32_e64 s[66:67], v28, v28
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v33, v1, s[64:65]
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v33, v0, s[66:67]
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v33, v10, s[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[26:27]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v33, v8, s[30:31]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[36:37]
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v33, v22, s[40:41]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v33, v6, s[44:45]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[48:49]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v33, v4, s[52:53]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[56:57]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v33, v2, s[60:61]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v33, v17, s[62:63]
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v32, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[16:17]
+; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[20:21]
+; GFX8-NEXT: v_cndmask_b32_e64 v26, v33, v26, s[24:25]
+; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[28:29]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[38:39]
+; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v34, s[42:43]
+; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[46:47]
+; GFX8-NEXT: v_cndmask_b32_e64 v20, v33, v20, s[50:51]
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[54:55]
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v33, v18, s[58:59]
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_or_b32_e32 v1, v18, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v20, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v23, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX8-NEXT: v_or_b32_e32 v8, v25, v8
+; GFX8-NEXT: v_or_b32_e32 v9, v26, v9
+; GFX8-NEXT: v_or_b32_e32 v10, v27, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v31, v11
+; GFX8-NEXT: v_or_b32_e32 v14, v28, v14
+; GFX8-NEXT: v_or_b32_e32 v15, v16, v15
+; GFX8-NEXT: v_readlane_b32 s68, v35, 36
+; GFX8-NEXT: v_readlane_b32 s67, v35, 35
+; GFX8-NEXT: v_readlane_b32 s66, v35, 34
+; GFX8-NEXT: v_readlane_b32 s65, v35, 33
+; GFX8-NEXT: v_readlane_b32 s64, v35, 32
+; GFX8-NEXT: v_readlane_b32 s63, v35, 31
+; GFX8-NEXT: v_readlane_b32 s62, v35, 30
+; GFX8-NEXT: v_readlane_b32 s61, v35, 29
+; GFX8-NEXT: v_readlane_b32 s60, v35, 28
+; GFX8-NEXT: v_readlane_b32 s59, v35, 27
+; GFX8-NEXT: v_readlane_b32 s58, v35, 26
+; GFX8-NEXT: v_readlane_b32 s57, v35, 25
+; GFX8-NEXT: v_readlane_b32 s56, v35, 24
+; GFX8-NEXT: v_readlane_b32 s55, v35, 23
+; GFX8-NEXT: v_readlane_b32 s54, v35, 22
+; GFX8-NEXT: v_readlane_b32 s53, v35, 21
+; GFX8-NEXT: v_readlane_b32 s52, v35, 20
+; GFX8-NEXT: v_readlane_b32 s51, v35, 19
+; GFX8-NEXT: v_readlane_b32 s50, v35, 18
+; GFX8-NEXT: v_readlane_b32 s49, v35, 17
+; GFX8-NEXT: v_readlane_b32 s48, v35, 16
+; GFX8-NEXT: v_readlane_b32 s47, v35, 15
+; GFX8-NEXT: v_readlane_b32 s46, v35, 14
+; GFX8-NEXT: v_readlane_b32 s45, v35, 13
+; GFX8-NEXT: v_readlane_b32 s44, v35, 12
+; GFX8-NEXT: v_readlane_b32 s43, v35, 11
+; GFX8-NEXT: v_readlane_b32 s42, v35, 10
+; GFX8-NEXT: v_readlane_b32 s41, v35, 9
+; GFX8-NEXT: v_readlane_b32 s40, v35, 8
+; GFX8-NEXT: v_readlane_b32 s39, v35, 7
+; GFX8-NEXT: v_readlane_b32 s38, v35, 6
+; GFX8-NEXT: v_readlane_b32 s37, v35, 5
+; GFX8-NEXT: v_readlane_b32 s36, v35, 4
+; GFX8-NEXT: v_readlane_b32 s35, v35, 3
+; GFX8-NEXT: v_readlane_b32 s34, v35, 2
+; GFX8-NEXT: v_readlane_b32 s31, v35, 1
+; GFX8-NEXT: v_readlane_b32 s30, v35, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT: v_writelane_b32 v35, s30, 0
+; GFX9-NEXT: v_writelane_b32 v35, s31, 1
+; GFX9-NEXT: v_writelane_b32 v35, s34, 2
+; GFX9-NEXT: v_writelane_b32 v35, s35, 3
+; GFX9-NEXT: v_writelane_b32 v35, s36, 4
+; GFX9-NEXT: v_writelane_b32 v35, s37, 5
+; GFX9-NEXT: v_writelane_b32 v35, s38, 6
+; GFX9-NEXT: v_writelane_b32 v35, s39, 7
+; GFX9-NEXT: v_writelane_b32 v35, s40, 8
+; GFX9-NEXT: v_writelane_b32 v35, s41, 9
+; GFX9-NEXT: v_writelane_b32 v35, s42, 10
+; GFX9-NEXT: v_writelane_b32 v35, s43, 11
+; GFX9-NEXT: v_writelane_b32 v35, s44, 12
+; GFX9-NEXT: v_writelane_b32 v35, s45, 13
+; GFX9-NEXT: v_writelane_b32 v35, s46, 14
+; GFX9-NEXT: v_writelane_b32 v35, s47, 15
+; GFX9-NEXT: v_writelane_b32 v35, s48, 16
+; GFX9-NEXT: v_writelane_b32 v35, s49, 17
+; GFX9-NEXT: v_writelane_b32 v35, s50, 18
+; GFX9-NEXT: v_writelane_b32 v35, s51, 19
+; GFX9-NEXT: v_writelane_b32 v35, s52, 20
+; GFX9-NEXT: v_writelane_b32 v35, s53, 21
+; GFX9-NEXT: v_writelane_b32 v35, s54, 22
+; GFX9-NEXT: v_writelane_b32 v35, s55, 23
+; GFX9-NEXT: v_writelane_b32 v35, s56, 24
+; GFX9-NEXT: v_writelane_b32 v35, s57, 25
+; GFX9-NEXT: v_writelane_b32 v35, s58, 26
+; GFX9-NEXT: v_writelane_b32 v35, s59, 27
+; GFX9-NEXT: v_writelane_b32 v35, s60, 28
+; GFX9-NEXT: v_writelane_b32 v35, s61, 29
+; GFX9-NEXT: v_writelane_b32 v35, s62, 30
+; GFX9-NEXT: v_writelane_b32 v35, s63, 31
+; GFX9-NEXT: v_writelane_b32 v35, s64, 32
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s65, 33
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v31
+; GFX9-NEXT: v_max_f32_e32 v30, v14, v30
+; GFX9-NEXT: v_writelane_b32 v35, s66, 34
+; GFX9-NEXT: s_movk_i32 s66, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v31, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v14, v30, 16, 1
+; GFX9-NEXT: v_add3_u32 v31, v31, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v32, v32
+; GFX9-NEXT: v_add3_u32 v14, v14, v30, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v30, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v30
+; GFX9-NEXT: v_max_f32_e32 v29, v13, v29
+; GFX9-NEXT: v_bfe_u32 v30, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v13, v29, 16, 1
+; GFX9-NEXT: v_add3_u32 v30, v30, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v32, v32
+; GFX9-NEXT: v_add3_u32 v13, v13, v29, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v29, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_max_f32_e32 v29, v32, v29
-; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
+; GFX9-NEXT: v_max_f32_e32 v28, v12, v28
+; GFX9-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX9-NEXT: v_bfe_u32 v12, v28, 16, 1
+; GFX9-NEXT: v_add3_u32 v29, v29, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v32, v32
+; GFX9-NEXT: v_add3_u32 v12, v12, v28, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v28, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v28
+; GFX9-NEXT: v_bfe_u32 v28, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v32, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v32, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_max_f32_e32 v28, v32, v28
-; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v33
+; GFX9-NEXT: v_max_f32_e32 v27, v11, v27
+; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT: v_bfe_u32 v11, v27, 16, 1
+; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v15, v15
+; GFX9-NEXT: v_add3_u32 v15, v33, v15, s66
+; GFX9-NEXT: v_add3_u32 v11, v11, v27, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v27, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_max_f32_e32 v27, v32, v27
-; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v27
+; GFX9-NEXT: v_max_f32_e32 v26, v10, v26
+; GFX9-NEXT: v_bfe_u32 v27, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v10, v26, 16, 1
+; GFX9-NEXT: v_add3_u32 v27, v27, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v33, v33
+; GFX9-NEXT: v_add3_u32 v10, v10, v26, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v26, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_max_f32_e32 v26, v32, v26
-; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v26
+; GFX9-NEXT: v_max_f32_e32 v25, v9, v25
+; GFX9-NEXT: v_bfe_u32 v26, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v9, v25, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v33, v33
+; GFX9-NEXT: v_add3_u32 v9, v9, v25, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v25, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32
-; GFX9-NEXT: v_max_f32_e32 v25, v32, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_max_f32_e32 v32, v32, v33
-; GFX9-NEXT: v_max_f32_e32 v15, v15, v24
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v25
+; GFX9-NEXT: v_max_f32_e32 v24, v8, v24
+; GFX9-NEXT: v_bfe_u32 v25, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v33, v33
+; GFX9-NEXT: v_add3_u32 v8, v8, v24, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v24, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v24
+; GFX9-NEXT: v_max_f32_e32 v23, v7, v23
+; GFX9-NEXT: v_bfe_u32 v24, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v7, v23, 16, 1
+; GFX9-NEXT: v_add3_u32 v24, v24, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[34:35], v33, v33
+; GFX9-NEXT: v_add3_u32 v7, v7, v23, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[36:37], v23, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v23
+; GFX9-NEXT: v_max_f32_e32 v22, v6, v22
+; GFX9-NEXT: v_bfe_u32 v23, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v6, v22, 16, 1
+; GFX9-NEXT: v_add3_u32 v23, v23, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[38:39], v33, v33
+; GFX9-NEXT: v_add3_u32 v6, v6, v22, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[40:41], v22, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v22
+; GFX9-NEXT: v_max_f32_e32 v21, v5, v21
+; GFX9-NEXT: v_bfe_u32 v22, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v5, v21, 16, 1
+; GFX9-NEXT: v_add3_u32 v22, v22, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[42:43], v33, v33
+; GFX9-NEXT: v_add3_u32 v5, v5, v21, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[44:45], v21, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v21
+; GFX9-NEXT: v_max_f32_e32 v20, v4, v20
+; GFX9-NEXT: v_bfe_u32 v21, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v4, v20, 16, 1
+; GFX9-NEXT: v_add3_u32 v21, v21, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[46:47], v33, v33
+; GFX9-NEXT: v_add3_u32 v4, v4, v20, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[48:49], v20, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v20
+; GFX9-NEXT: v_max_f32_e32 v19, v3, v19
+; GFX9-NEXT: v_bfe_u32 v20, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v3, v19, 16, 1
+; GFX9-NEXT: v_add3_u32 v20, v20, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[50:51], v33, v33
+; GFX9-NEXT: v_add3_u32 v3, v3, v19, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[52:53], v19, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v19
+; GFX9-NEXT: v_max_f32_e32 v18, v2, v18
+; GFX9-NEXT: v_bfe_u32 v19, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v2, v18, 16, 1
+; GFX9-NEXT: v_add3_u32 v19, v19, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[54:55], v33, v33
+; GFX9-NEXT: v_add3_u32 v2, v2, v18, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[56:57], v18, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v18
+; GFX9-NEXT: v_max_f32_e32 v17, v1, v17
+; GFX9-NEXT: v_bfe_u32 v18, v33, 16, 1
+; GFX9-NEXT: v_bfe_u32 v1, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v33, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[58:59], v33, v33
+; GFX9-NEXT: v_add3_u32 v1, v1, v17, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[60:61], v17, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX9-NEXT: v_max_f32_e32 v16, v0, v16
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v34
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v17
+; GFX9-NEXT: v_bfe_u32 v0, v16, 16, 1
+; GFX9-NEXT: v_bfe_u32 v17, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v0, v0, v16, s66
+; GFX9-NEXT: v_cmp_o_f32_e64 s[64:65], v16, v16
+; GFX9-NEXT: v_bfe_u32 v16, v32, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v33, s66
+; GFX9-NEXT: v_add3_u32 v16, v16, v32, s66
+; GFX9-NEXT: v_writelane_b32 v35, s67, 35
+; GFX9-NEXT: v_cmp_o_f32_e64 s[62:63], v33, v33
+; GFX9-NEXT: v_cmp_o_f32_e64 s[66:67], v32, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_mov_b32_e32 v28, 0x7fc0
+; GFX9-NEXT: v_cndmask_b32_e64 v16, v28, v16, s[66:67]
+; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v32, s[16:17]
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v28, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v28, v14, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v30, v28, v30, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v28, v13, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v29, v28, v29, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v28, v15, s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v28, v11, s[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v27, v28, v27, s[20:21]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v10, s[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v26, v28, v26, s[24:25]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v28, v9, s[26:27]
+; GFX9-NEXT: v_cndmask_b32_e64 v25, v28, v25, s[28:29]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v28, v8, s[30:31]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, v28, v24, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v28, v7, s[36:37]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, v28, v23, s[38:39]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v28, v6, s[40:41]
+; GFX9-NEXT: v_cndmask_b32_e64 v22, v28, v22, s[42:43]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v28, v5, s[44:45]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, v28, v21, s[46:47]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v28, v4, s[48:49]
+; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[50:51]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v28, v3, s[52:53]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, v28, v19, s[54:55]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v28, v2, s[56:57]
+; GFX9-NEXT: v_cndmask_b32_e64 v18, v28, v18, s[58:59]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v28, v1, s[60:61]
+; GFX9-NEXT: v_cndmask_b32_e64 v17, v28, v17, s[62:63]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v28, v0, s[64:65]
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
@@ -17053,13 +23673,72 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v32bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v15, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v32, v16, s4
+; GFX9-NEXT: v_readlane_b32 s67, v35, 35
+; GFX9-NEXT: v_readlane_b32 s66, v35, 34
+; GFX9-NEXT: v_readlane_b32 s65, v35, 33
+; GFX9-NEXT: v_readlane_b32 s64, v35, 32
+; GFX9-NEXT: v_readlane_b32 s63, v35, 31
+; GFX9-NEXT: v_readlane_b32 s62, v35, 30
+; GFX9-NEXT: v_readlane_b32 s61, v35, 29
+; GFX9-NEXT: v_readlane_b32 s60, v35, 28
+; GFX9-NEXT: v_readlane_b32 s59, v35, 27
+; GFX9-NEXT: v_readlane_b32 s58, v35, 26
+; GFX9-NEXT: v_readlane_b32 s57, v35, 25
+; GFX9-NEXT: v_readlane_b32 s56, v35, 24
+; GFX9-NEXT: v_readlane_b32 s55, v35, 23
+; GFX9-NEXT: v_readlane_b32 s54, v35, 22
+; GFX9-NEXT: v_readlane_b32 s53, v35, 21
+; GFX9-NEXT: v_readlane_b32 s52, v35, 20
+; GFX9-NEXT: v_readlane_b32 s51, v35, 19
+; GFX9-NEXT: v_readlane_b32 s50, v35, 18
+; GFX9-NEXT: v_readlane_b32 s49, v35, 17
+; GFX9-NEXT: v_readlane_b32 s48, v35, 16
+; GFX9-NEXT: v_readlane_b32 s47, v35, 15
+; GFX9-NEXT: v_readlane_b32 s46, v35, 14
+; GFX9-NEXT: v_readlane_b32 s45, v35, 13
+; GFX9-NEXT: v_readlane_b32 s44, v35, 12
+; GFX9-NEXT: v_readlane_b32 s43, v35, 11
+; GFX9-NEXT: v_readlane_b32 s42, v35, 10
+; GFX9-NEXT: v_readlane_b32 s41, v35, 9
+; GFX9-NEXT: v_readlane_b32 s40, v35, 8
+; GFX9-NEXT: v_readlane_b32 s39, v35, 7
+; GFX9-NEXT: v_readlane_b32 s38, v35, 6
+; GFX9-NEXT: v_readlane_b32 s37, v35, 5
+; GFX9-NEXT: v_readlane_b32 s36, v35, 4
+; GFX9-NEXT: v_readlane_b32 s35, v35, 3
+; GFX9-NEXT: v_readlane_b32 s34, v35, 2
+; GFX9-NEXT: v_readlane_b32 s31, v35, 1
+; GFX9-NEXT: v_readlane_b32 s30, v35, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
@@ -17072,14 +23751,18 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
@@ -17092,7 +23775,73 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_max_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_max_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_max_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v38, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v48, v10, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_max_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v29, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v30, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v36, v11, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v38, v38, v49, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v48, v10, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v48, v0, 16, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v29, v29, v37, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v37, v19, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v30, v12, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v30, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v39, v18, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v36, v11, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v36, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s30, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s31, v0, v0
+; GFX10-NEXT: v_add3_u32 v17, v49, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v0, v48, v0, 0x7fff
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_cmp_o_f32_e64 s5, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s27, v19, v19
+; GFX10-NEXT: v_cmp_o_f32_e64 s28, v18, v18
+; GFX10-NEXT: v_cmp_o_f32_e64 s29, v1, v1
+; GFX10-NEXT: v_cmp_o_f32_e64 s34, v2, v2
+; GFX10-NEXT: v_add3_u32 v19, v37, v19, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v30, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v18, v39, v18, 0x7fff
+; GFX10-NEXT: v_add3_u32 v1, v36, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
@@ -17110,30 +23859,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s25, v20, v20
+; GFX10-NEXT: v_add3_u32 v20, v33, v20, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7fc0, v0, s31
; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
; GFX10-NEXT: v_max_f32_e32 v25, v54, v53
@@ -17143,142 +23876,423 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_max_f32_e32 v23, v66, v65
; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
; GFX10-NEXT: v_max_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_max_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_max_f32_e32 v29, v29, v36
-; GFX10-NEXT: v_max_f32_e32 v28, v28, v38
-; GFX10-NEXT: v_max_f32_e32 v27, v27, v48
-; GFX10-NEXT: v_max_f32_e32 v26, v26, v50
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302
-; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v26, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v28, v13, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v19, 0x7fc0, v19, s27
+; GFX10-NEXT: v_cndmask_b32_e64 v18, 0x7fc0, v18, s28
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7fc0, v1, s29
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7fc0, v2, s34
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX10-NEXT: v_bfe_u32 v50, v51, 16, 1
+; GFX10-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX10-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX10-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX10-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX10-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v26, v14, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v35, v35
+; GFX10-NEXT: v_add3_u32 v27, v27, v35, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v35, v4, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v28, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v28, v3, 16, 1
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX10-NEXT: v_cmp_o_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_cmp_o_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_cmp_o_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_cmp_o_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_cmp_o_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_cmp_o_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_cmp_o_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_cmp_o_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_cmp_o_f32_e64 s22, v5, v5
+; GFX10-NEXT: v_cmp_o_f32_e64 s23, v21, v21
+; GFX10-NEXT: v_cmp_o_f32_e64 s24, v4, v4
+; GFX10-NEXT: v_cmp_o_f32_e64 s26, v3, v3
+; GFX10-NEXT: v_add3_u32 v50, v50, v51, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v52, v9, 0x7fff
+; GFX10-NEXT: v_add3_u32 v25, v53, v25, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v54, v8, 0x7fff
+; GFX10-NEXT: v_add3_u32 v24, v55, v24, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v64, v7, 0x7fff
+; GFX10-NEXT: v_add3_u32 v23, v65, v23, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v66, v6, 0x7fff
+; GFX10-NEXT: v_add3_u32 v22, v67, v22, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v68, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v21, v26, v21, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v35, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v28, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v27
+; GFX10-NEXT: v_cmp_o_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v26, 0x7fc0, v26, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v16, 0x7fc0, v16, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0x7fc0, v13, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0x7fc0, v27, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0x7fc0, v12, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v28, 0x7fc0, v28, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0x7fc0, v11, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v29, 0x7fc0, v29, s11
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0x7fc0, v10, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v30, 0x7fc0, v30, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0x7fc0, v9, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, 0x7fc0, v25, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0x7fc0, v8, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0x7fc0, v24, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0x7fc0, v7, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, 0x7fc0, v23, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0x7fc0, v6, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0x7fc0, v22, s21
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7fc0, v5, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v21, 0x7fc0, v21, s23
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0x7fc0, v4, s24
+; GFX10-NEXT: v_cndmask_b32_e64 v20, 0x7fc0, v20, s25
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7fc0, v3, s26
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x5040100
+; GFX10-NEXT: v_perm_b32 v10, v10, v29, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v12, v27, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v13, v26, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX10-NEXT: v_max_f32_e32 v17, v31, v33
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v32
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_cmp_o_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7fc0, v17, s4
+; GFX10-NEXT: v_perm_b32 v15, v17, v15, 0x5040100
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX10-NEXT: v_max_f32_e32 v16, v32, v16
-; GFX10-NEXT: v_max_f32_e32 v15, v15, v17
-; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_max_f32_e32 v23, v66, v65
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v4, v4, v20
+; GFX11-NEXT: v_max_f32_e32 v21, v70, v69
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17
+; GFX11-NEXT: v_max_f32_e32 v17, v86, v85
+; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_dual_max_f32 v6, v6, v22 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_max_f32_e32 v22, v68, v67
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_max_f32_e32 v27, v50, v49
+; GFX11-NEXT: v_bfe_u32 v50, v10, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_max_f32 v2, v2, v18 :: v_dual_max_f32 v3, v3, v19
-; GFX11-NEXT: v_dual_max_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX11-NEXT: v_max_f32_e32 v26, v52, v51
-; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT: v_add3_u32 v50, v50, v10, 0x7fff
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v18 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_max_f32_e32 v20, v80, v71
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v26, v52, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_max_f32 v18, v84, v83 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: v_dual_max_f32 v29, v38, v37 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v14, v14, v30 :: v_dual_lshlrev_b32 v31, 16, v15
+; GFX11-NEXT: v_max_f32_e32 v9, v9, v25
; GFX11-NEXT: v_max_f32_e32 v25, v54, v53
-; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_max_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX11-NEXT: v_max_f32_e32 v23, v66, v65
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_max_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_max_f32_e32 v29, v38, v37
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_max_f32_e32 v37, v86, v85
-; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX11-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_max_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
-; GFX11-NEXT: v_dual_max_f32 v34, v80, v71 :: v_dual_max_f32 v35, v82, v81
-; GFX11-NEXT: v_max_f32_e32 v36, v84, v83
-; GFX11-NEXT: v_dual_max_f32 v16, v32, v16 :: v_dual_max_f32 v15, v15, v17
-; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302
-; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302
-; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
-; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
-; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v34, v14, 16, 1
+; GFX11-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX11-NEXT: v_bfe_u32 v37, v29, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1
+; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX11-NEXT: v_add3_u32 v34, v34, v14, 0x7fff
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_bfe_u32 v38, v12, 16, 1
+; GFX11-NEXT: v_add3_u32 v37, v37, v29, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_bfe_u32 v39, v28, 16, 1
+; GFX11-NEXT: v_add3_u32 v38, v38, v12, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc0, v16, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_bfe_u32 v48, v11, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT: v_bfe_u32 v49, v27, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc0, v34, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_add3_u32 v48, v48, v11, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT: v_add3_u32 v49, v49, v27, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v51, v26, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v30, 0x7fc0, v35, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT: v_bfe_u32 v52, v9, 16, 1
+; GFX11-NEXT: v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc0, v36, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_bfe_u32 v53, v25, 16, 1
+; GFX11-NEXT: v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT: v_bfe_u32 v54, v8, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v29, 0x7fc0, v37, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_add3_u32 v53, v53, v25, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT: v_bfe_u32 v55, v24, 16, 1
+; GFX11-NEXT: v_add3_u32 v54, v54, v8, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc0, v38, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT: v_bfe_u32 v64, v7, 16, 1
+; GFX11-NEXT: v_add3_u32 v55, v55, v24, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v28, 0x7fc0, v39, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_bfe_u32 v65, v23, 16, 1
+; GFX11-NEXT: v_add3_u32 v64, v64, v7, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc0, v48, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT: v_add3_u32 v65, v65, v23, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT: v_bfe_u32 v67, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v27, 0x7fc0, v49, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT: v_bfe_u32 v68, v5, 16, 1
+; GFX11-NEXT: v_add3_u32 v67, v67, v22, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc0, v50, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT: v_bfe_u32 v69, v21, 16, 1
+; GFX11-NEXT: v_add3_u32 v68, v68, v5, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT: v_bfe_u32 v70, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v26, 0x7fc0, v51, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v69, v69, v21, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT: v_bfe_u32 v71, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v70, v70, v4, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc0, v52, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT: v_bfe_u32 v80, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v71, v71, v20, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT: v_cndmask_b32_e32 v25, 0x7fc0, v53, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_bfe_u32 v81, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v80, v80, v3, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT: v_bfe_u32 v83, v18, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc0, v54, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT: v_add3_u32 v81, v81, v19, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-NEXT: v_bfe_u32 v84, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v83, v83, v18, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v24, 0x7fc0, v55, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT: v_bfe_u32 v85, v17, 16, 1
+; GFX11-NEXT: v_add3_u32 v84, v84, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v64, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT: v_bfe_u32 v86, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v85, v85, v17, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT: v_bfe_u32 v82, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v23, 0x7fc0, v65, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v86, v86, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v85
+; GFX11-NEXT: v_add3_u32 v82, v82, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc0, v66, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v22, 0x7fc0, v67, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x5040100
+; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x5040100
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc0, v68, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7fc0, v69, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v70, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v20, 0x7fc0, v71, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v80, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v19, 0x7fc0, v81, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v18, 0x7fc0, v83, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v84, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v85, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v86, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v82, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x5040100
+; GFX11-NEXT: v_max_f32_e32 v31, v31, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v15, v15, v32
+; GFX11-NEXT: v_bfe_u32 v17, v31, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v18, v15, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT: v_add3_u32 v17, v17, v31, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc0, v17, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7fc0, v18, vcc_lo
+; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
ret <32 x bfloat> %op
@@ -17359,7 +24373,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x260
; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sqrt_bf16:
@@ -17384,7 +24404,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX9-NEXT: v_mov_b32_e32 v2, 0x260
; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sqrt_bf16:
@@ -17407,7 +24433,11 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sqrt_bf16:
@@ -17437,8 +24467,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.sqrt.bf16(bfloat %a)
ret bfloat %op
@@ -17468,7 +24503,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ldexp_bf16_i32:
@@ -17476,7 +24517,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ldexp_bf16_i32:
@@ -17484,7 +24531,11 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_ldexp_bf16_i32:
@@ -17493,7 +24544,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
ret bfloat %op
@@ -17527,10 +24584,16 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX8-LABEL: v_frexp_bf16_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
-; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_frexp_bf16_i16:
@@ -17538,7 +24601,13 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -17548,7 +24617,11 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
ret { bfloat, i16 } %op
@@ -17638,7 +24711,13 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX8-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_log_bf16:
@@ -17663,7 +24742,13 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log_bf16:
@@ -17682,7 +24767,11 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_log_bf16:
@@ -17706,8 +24795,13 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log.bf16(bfloat %a)
ret bfloat %op
@@ -17758,8 +24852,14 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_log2_bf16:
@@ -17774,8 +24874,14 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log2_bf16:
@@ -17788,7 +24894,11 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_log2_bf16:
@@ -17804,7 +24914,13 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log2.bf16(bfloat %a)
ret bfloat %op
@@ -17889,7 +25005,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX8-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_log10_bf16:
@@ -17914,7 +25036,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log10_bf16:
@@ -17933,7 +25061,11 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_log10_bf16:
@@ -17957,8 +25089,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log10.bf16(bfloat %a)
ret bfloat %op
@@ -18045,7 +25182,13 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp_bf16:
@@ -18070,7 +25213,13 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp_bf16:
@@ -18090,7 +25239,11 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_exp_bf16:
@@ -18115,8 +25268,13 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.exp.bf16(bfloat %a)
ret bfloat %op
@@ -18167,8 +25325,14 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX8-NEXT: v_exp_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp2_bf16:
@@ -18183,8 +25347,14 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp2_bf16:
@@ -18197,7 +25367,11 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_exp_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_exp2_bf16:
@@ -18213,7 +25387,13 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.exp2.bf16(bfloat %a)
ret bfloat %op
@@ -18296,7 +25476,13 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp10_bf16:
@@ -18321,7 +25507,13 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp10_bf16:
@@ -18341,7 +25533,11 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_exp10_bf16:
@@ -18366,8 +25562,13 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.exp10.bf16(bfloat %a)
ret bfloat %op
@@ -18397,7 +25598,13 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_ceil_f32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ceil_bf16:
@@ -18405,7 +25612,13 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_ceil_f32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ceil_bf16:
@@ -18413,7 +25626,11 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_ceil_f32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_ceil_bf16:
@@ -18422,7 +25639,13 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ceil_f32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.ceil.bf16(bfloat %a)
ret bfloat %op
@@ -18452,7 +25675,13 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_trunc_bf16:
@@ -18460,7 +25689,13 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_trunc_bf16:
@@ -18468,7 +25703,11 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_trunc_bf16:
@@ -18477,7 +25716,13 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.trunc.bf16(bfloat %a)
ret bfloat %op
@@ -18507,7 +25752,13 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_rndne_f32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_rint_bf16:
@@ -18515,7 +25766,13 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rint_bf16:
@@ -18523,7 +25780,11 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rint_bf16:
@@ -18532,7 +25793,13 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.rint.bf16(bfloat %a)
ret bfloat %op
@@ -18562,7 +25829,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_rndne_f32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_nearbyint_bf16:
@@ -18570,7 +25843,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_nearbyint_bf16:
@@ -18578,7 +25857,11 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_nearbyint_bf16:
@@ -18587,7 +25870,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
ret bfloat %op
@@ -18635,7 +25924,13 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX8-NEXT: s_brev_b32 s4, -2
; GFX8-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_round_bf16:
@@ -18649,7 +25944,13 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_round_bf16:
@@ -18662,7 +25963,11 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_round_bf16:
@@ -18678,8 +25983,13 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
; GFX11-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.round.bf16(bfloat %a)
ret bfloat %op
@@ -18709,7 +26019,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_rndne_f32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_roundeven_bf16:
@@ -18717,7 +26033,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_bf16:
@@ -18725,7 +26047,11 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_roundeven_bf16:
@@ -18734,7 +26060,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.roundeven.bf16(bfloat %a)
ret bfloat %op
@@ -18764,7 +26096,13 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_floor_f32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_floor_bf16:
@@ -18772,7 +26110,13 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_floor_f32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_floor_bf16:
@@ -18780,7 +26124,11 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_floor_f32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_floor_bf16:
@@ -18789,7 +26137,13 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_floor_f32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.floor.bf16(bfloat %a)
ret bfloat %op
@@ -18813,7 +26167,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_canonicalize_bf16:
@@ -18821,7 +26181,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_canonicalize_bf16:
@@ -18829,7 +26195,11 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_canonicalize_bf16:
@@ -18838,7 +26208,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
ret bfloat %op
@@ -22198,21 +29574,37 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i16_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i16_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_i16_to_bf16:
@@ -22221,7 +29613,13 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp i16 %x to bfloat
ret bfloat %op
@@ -22255,37 +29653,86 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0
-; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <2 x i16> %x to <2 x bfloat>
ret <2 x bfloat> %op
@@ -22325,32 +29772,116 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i16> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -22394,54 +29925,147 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add3_u32 v5, v5, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v1
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v0
-; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16
+; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i16> %x to <4 x bfloat>
ret <4 x bfloat> %op
@@ -22466,29 +30090,50 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i32_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i32_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_i32_to_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp i32 %x to bfloat
ret bfloat %op
@@ -22516,10 +30161,23 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
@@ -22527,7 +30185,19 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -22536,7 +30206,17 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16:
@@ -22544,8 +30224,22 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <2 x i32> %x to <2 x bfloat>
ret <2 x bfloat> %op
@@ -22578,22 +30272,58 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v1
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -22602,9 +30332,53 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i32> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -22639,24 +30413,72 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -22668,8 +30490,28 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
@@ -22679,9 +30521,34 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i32> %x to <4 x bfloat>
ret <4 x bfloat> %op
@@ -22738,8 +30605,14 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i64_to_bf16:
@@ -22752,12 +30625,18 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX9-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-NEXT: v_min_u32_e32 v2, v3, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i64_to_bf16:
@@ -22775,7 +30654,11 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_i64_to_bf16:
@@ -22798,7 +30681,13 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp i64 %x to bfloat
ret bfloat %op
@@ -22877,25 +30766,39 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
; GFX8-NEXT: v_min_u32_e32 v4, v4, v5
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
+; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
+; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
; GFX8-NEXT: v_min_u32_e32 v6, v0, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_ldexp_f32 v1, v5, v2
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
@@ -22908,24 +30811,36 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
; GFX9-NEXT: v_min_u32_e32 v4, v4, v5
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
+; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
+; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v0, v0, v4, s4
; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
; GFX9-NEXT: v_min_u32_e32 v6, v0, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5
+; GFX9-NEXT: v_sub_u32_e32 v3, 32, v6
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
-; GFX9-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -22956,7 +30871,17 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16:
@@ -22995,8 +30920,22 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <2 x i64> %x to <2 x bfloat>
ret <2 x bfloat> %op
@@ -23101,38 +31040,57 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7
; GFX8-NEXT: v_min_u32_e32 v6, v6, v7
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT: v_xor_b32_e32 v7, v0, v1
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT: v_ldexp_f32 v6, v4, v5
-; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1
-; GFX8-NEXT: v_ffbh_i32_e32 v4, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT: v_min_u32_e32 v7, v4, v5
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX8-NEXT: v_min_u32_e32 v0, 1, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: v_xor_b32_e32 v5, v2, v3
-; GFX8-NEXT: v_ffbh_i32_e32 v4, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT: v_min_u32_e32 v4, v4, v5
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT: v_ffbh_i32_e32 v6, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x7fc0
+; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v6
+; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3
+; GFX8-NEXT: v_ffbh_i32_e32 v5, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6
+; GFX8-NEXT: v_min_u32_e32 v5, v5, v6
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -23148,79 +31106,182 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
; GFX9-NEXT: v_ffbh_i32_e32 v6, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT: v_or_b32_e32 v5, v1, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v7, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc0
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_sub_u32_e32 v4, 32, v6
+; GFX9-NEXT: v_xor_b32_e32 v6, v2, v3
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT: v_ffbh_i32_e32 v5, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX9-NEXT: v_add_u32_e32 v5, -1, v5
+; GFX9-NEXT: v_add_u32_e32 v6, 32, v6
+; GFX9-NEXT: v_min_u32_e32 v5, v5, v6
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
+; GFX9-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v4
+; GFX9-NEXT: v_sub_u32_e32 v3, 32, v5
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v7, v0, v1
-; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5
-; GFX10-NEXT: v_xor_b32_e32 v9, v2, v3
-; GFX10-NEXT: v_ffbh_i32_e32 v6, v5
+; GFX10-NEXT: v_xor_b32_e32 v6, v0, v1
+; GFX10-NEXT: v_xor_b32_e32 v8, v2, v3
; GFX10-NEXT: v_ffbh_i32_e32 v10, v1
-; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX10-NEXT: v_xor_b32_e32 v9, v4, v5
; GFX10-NEXT: v_ffbh_i32_e32 v11, v3
-; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v6
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7
+; GFX10-NEXT: v_ffbh_i32_e32 v7, v5
+; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX10-NEXT: v_add_nc_u32_e32 v6, 32, v6
; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
-; GFX10-NEXT: v_min_u32_e32 v7, v10, v7
-; GFX10-NEXT: v_min_u32_e32 v9, v11, v9
-; GFX10-NEXT: v_min_u32_e32 v6, v6, v8
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v7, -1, v7
+; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX10-NEXT: v_min_u32_e32 v6, v10, v6
+; GFX10-NEXT: v_min_u32_e32 v8, v11, v8
+; GFX10-NEXT: v_min_u32_e32 v7, v7, v9
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
-; GFX10-NEXT: v_or_b32_e32 v2, v5, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7
-; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v9
+; GFX10-NEXT: v_min_u32_e32 v2, 1, v4
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT: v_or_b32_e32 v2, v5, v2
; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7
; GFX10-NEXT: v_ldexp_f32 v1, v1, v4
-; GFX10-NEXT: v_ldexp_f32 v2, v2, v6
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX10-NEXT: v_add3_u32 v3, v4, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v5, v1, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_xor_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_xor_b32_e32 v8, v2, v3
+; GFX11-NEXT: v_cls_i32_e32 v10, v1
+; GFX11-NEXT: v_cls_i32_e32 v11, v3
+; GFX11-NEXT: v_xor_b32_e32 v9, v4, v5
+; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v11, -1, v11
+; GFX11-NEXT: v_cls_i32_e32 v7, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, -1, v7
+; GFX11-NEXT: v_min_u32_e32 v6, v10, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_u32_e32 v8, v11, v8
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v6
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT: v_min_u32_e32 v7, v7, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v7
+; GFX11-NEXT: v_min_u32_e32 v2, 1, v4
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v4
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX11-NEXT: v_add3_u32 v3, v4, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v4, v5, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i64> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -23350,51 +31411,77 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
+; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
+; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_xor_b32_e32 v5, v6, v7
-; GFX8-NEXT: v_cvt_f32_i32_e32 v9, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX8-NEXT: v_ffbh_i32_e32 v4, v7
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
; GFX8-NEXT: v_min_u32_e32 v10, v4, v5
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v8
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1
+; GFX8-NEXT: v_ffbh_i32_e32 v8, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX8-NEXT: v_ldexp_f32 v5, v9, v6
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
-; GFX8-NEXT: v_xor_b32_e32 v7, v0, v1
-; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
-; GFX8-NEXT: v_ffbh_i32_e32 v6, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7
-; GFX8-NEXT: v_min_u32_e32 v6, v6, v7
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
+; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 32, v10
+; GFX8-NEXT: v_ldexp_f32 v4, v4, v7
+; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT: v_ldexp_f32 v7, v0, v1
+; GFX8-NEXT: v_bfe_u32 v0, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v7
; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_cvt_f32_i32_e32 v7, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v0
; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT: v_min_u32_e32 v8, v0, v1
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT: v_min_u32_e32 v9, v0, v1
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v8
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_ldexp_f32 v1, v7, v2
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v5, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -23407,50 +31494,72 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
+; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
+; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v8, s4
; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7
-; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX9-NEXT: v_ffbh_i32_e32 v4, v7
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
; GFX9-NEXT: v_min_u32_e32 v10, v4, v5
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT: v_ldexp_f32 v6, v9, v8
-; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_ffbh_i32_e32 v7, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_add_u32_e32 v7, -1, v7
-; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_xor_b32_e32 v9, v0, v1
+; GFX9-NEXT: v_ffbh_i32_e32 v8, v1
+; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX9-NEXT: v_add_u32_e32 v8, -1, v8
+; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
+; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v7, 32, v10
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT: v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_ldexp_f32 v4, v4, v7
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v7, v7, v4, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT: v_ldexp_f32 v7, v0, v1
+; GFX9-NEXT: v_bfe_u32 v0, v7, 16, 1
; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT: v_add3_u32 v8, v0, v7, s4
; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v8, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5
+; GFX9-NEXT: v_min_u32_e32 v9, v0, v1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_perm_b32 v1, v4, v6, s4
+; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -23458,18 +31567,18 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5
; GFX10-NEXT: v_ffbh_i32_e32 v9, v5
-; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7
; GFX10-NEXT: v_xor_b32_e32 v13, v0, v1
-; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3
+; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7
+; GFX10-NEXT: v_ffbh_i32_e32 v12, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9
+; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3
; GFX10-NEXT: v_ffbh_i32_e32 v10, v7
-; GFX10-NEXT: v_ffbh_i32_e32 v12, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11
; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12
; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14
; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
-; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12
; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11
; GFX10-NEXT: v_min_u32_e32 v8, v9, v8
; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v13
@@ -23483,29 +31592,49 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX10-NEXT: v_min_u32_e32 v9, v12, v9
; GFX10-NEXT: v_min_u32_e32 v11, v13, v14
; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX10-NEXT: v_min_u32_e32 v6, 1, v6
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX10-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11
+; GFX10-NEXT: v_min_u32_e32 v5, 1, v6
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v8
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v10
+; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8
-; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v5
+; GFX10-NEXT: v_ldexp_f32 v3, v4, v6
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v9
+; GFX10-NEXT: v_or_b32_e32 v1, v7, v5
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9
; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT: v_ldexp_f32 v3, v4, v7
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
-; GFX10-NEXT: v_ldexp_f32 v2, v2, v6
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v11
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v10
+; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v4
+; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX10-NEXT: v_ldexp_f32 v2, v2, v5
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -23513,18 +31642,18 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v8, v4, v5
; GFX11-NEXT: v_cls_i32_e32 v9, v5
-; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7
; GFX11-NEXT: v_xor_b32_e32 v13, v0, v1
-; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3
+; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7
+; GFX11-NEXT: v_cls_i32_e32 v12, v1
; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9
+; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3
; GFX11-NEXT: v_cls_i32_e32 v10, v7
-; GFX11-NEXT: v_cls_i32_e32 v12, v1
; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11
; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14
; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11
; GFX11-NEXT: v_min_u32_e32 v8, v9, v8
; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v13
@@ -23537,37 +31666,61 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
; GFX11-NEXT: v_min_u32_e32 v9, v12, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_min_u32_e32 v11, v13, v14
; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT: v_min_u32_e32 v6, 1, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11
+; GFX11-NEXT: v_min_u32_e32 v5, 1, v6
+; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT: v_sub_nc_u32_e32 v7, 32, v10
+; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v4
-; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v8
-; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_ldexp_f32 v3, v4, v6
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v9
+; GFX11-NEXT: v_or_b32_e32 v1, v7, v5
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9
; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
-; GFX11-NEXT: v_ldexp_f32 v3, v4, v7
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v11
+; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v10
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-NEXT: v_ldexp_f32 v2, v2, v5
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v6
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
-; GFX11-NEXT: v_ldexp_f32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i64> %x to <4 x bfloat>
ret <4 x bfloat> %op
@@ -23594,21 +31747,37 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i16_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i16_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_i16_to_bf16:
@@ -23617,7 +31786,13 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp i16 %x to bfloat
ret bfloat %op
@@ -23649,10 +31824,23 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -23660,7 +31848,19 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -23669,7 +31869,17 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -23680,8 +31890,22 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <2 x i16> %x to <2 x bfloat>
ret <2 x bfloat> %op
@@ -23719,23 +31943,58 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -23744,9 +32003,58 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i16> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -23789,24 +32097,72 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_add3_u32 v5, v5, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -23818,8 +32174,28 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -23835,9 +32211,34 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i16> %x to <4 x bfloat>
ret <4 x bfloat> %op
@@ -23862,29 +32263,50 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i32_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i32_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_i32_to_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp i32 %x to bfloat
ret bfloat %op
@@ -23912,10 +32334,23 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -23923,7 +32358,19 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -23932,7 +32379,17 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -23940,8 +32397,22 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <2 x i32> %x to <2 x bfloat>
ret <2 x bfloat> %op
@@ -23974,22 +32445,58 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -23998,9 +32505,53 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i32> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -24035,24 +32586,72 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -24064,8 +32663,28 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
@@ -24075,9 +32694,34 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i32> %x to <4 x bfloat>
ret <4 x bfloat> %op
@@ -24122,8 +32766,14 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i64_to_bf16:
@@ -24132,12 +32782,18 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX9-NEXT: v_ffbh_u32_e32 v2, v1
; GFX9-NEXT: v_min_u32_e32 v2, 32, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v2
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc0
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i64_to_bf16:
@@ -24151,7 +32807,11 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_i64_to_bf16:
@@ -24168,8 +32828,13 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp i64 %x to bfloat
ret bfloat %op
@@ -24230,19 +32895,32 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
+; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
+; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
; GFX8-NEXT: v_min_u32_e32 v6, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_ldexp_f32 v1, v5, v2
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
@@ -24251,20 +32929,32 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX9-NEXT: v_ffbh_u32_e32 v4, v1
; GFX9-NEXT: v_min_u32_e32 v4, 32, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
+; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
+; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v0, v0, v4, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
; GFX9-NEXT: v_min_u32_e32 v6, 32, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5
+; GFX9-NEXT: v_sub_u32_e32 v3, 32, v6
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
-; GFX9-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -24287,7 +32977,17 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16:
@@ -24315,8 +33015,22 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <2 x i64> %x to <2 x bfloat>
ret <2 x bfloat> %op
@@ -24393,30 +33107,49 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_ffbh_u32_e32 v6, v5
; GFX8-NEXT: v_min_u32_e32 v6, 32, v6
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT: v_ldexp_f32 v6, v4, v5
-; GFX8-NEXT: v_ffbh_u32_e32 v4, v1
-; GFX8-NEXT: v_min_u32_e32 v7, 32, v4
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX8-NEXT: v_min_u32_e32 v0, 1, v4
-; GFX8-NEXT: v_ffbh_u32_e32 v4, v3
-; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_ffbh_u32_e32 v6, v1
+; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
+; GFX8-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x7fc0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT: v_ffbh_u32_e32 v5, v3
+; GFX8-NEXT: v_min_u32_e32 v5, 32, v5
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v6
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -24425,30 +33158,46 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX9-NEXT: v_ffbh_u32_e32 v6, v5
; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
; GFX9-NEXT: v_ffbh_u32_e32 v6, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT: v_or_b32_e32 v5, v1, v0
-; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_min_u32_e32 v7, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc0
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT: v_ffbh_u32_e32 v5, v3
+; GFX9-NEXT: v_min_u32_e32 v5, 32, v5
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
+; GFX9-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT: v_sub_u32_e32 v4, 32, v6
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v4
+; GFX9-NEXT: v_sub_u32_e32 v3, 32, v5
+; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX9-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -24463,24 +33212,96 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
-; GFX10-NEXT: v_or_b32_e32 v2, v5, v4
+; GFX10-NEXT: v_min_u32_e32 v2, 1, v4
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT: v_or_b32_e32 v2, v5, v2
; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8
; GFX10-NEXT: v_ldexp_f32 v1, v1, v4
-; GFX10-NEXT: v_ldexp_f32 v2, v2, v8
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX10-NEXT: v_add3_u32 v3, v4, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v5, v1, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_clz_i32_u32_e32 v6, v1
+; GFX11-NEXT: v_clz_i32_u32_e32 v7, v3
+; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX11-NEXT: v_min_u32_e32 v7, 32, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT: v_min_u32_e32 v2, 1, v4
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v6
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v7
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v8
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v3, v4, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v5, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i64> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -24574,39 +33395,65 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_ffbh_u32_e32 v8, v5
; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: v_cvt_f32_u32_e32 v9, v4
+; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
+; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
+; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX8-NEXT: v_ffbh_u32_e32 v4, v7
; GFX8-NEXT: v_min_u32_e32 v10, 32, v4
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v8
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX8-NEXT: v_ldexp_f32 v5, v9, v6
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
-; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
-; GFX8-NEXT: v_ffbh_u32_e32 v6, v1
-; GFX8-NEXT: v_min_u32_e32 v6, 32, v6
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_ffbh_u32_e32 v8, v1
+; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 32, v10
+; GFX8-NEXT: v_ldexp_f32 v4, v4, v7
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f32_u32_e32 v7, v0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT: v_ldexp_f32 v7, v0, v1
+; GFX8-NEXT: v_bfe_u32 v0, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v7
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v0
; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT: v_min_u32_e32 v8, 32, v0
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT: v_min_u32_e32 v9, 32, v0
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v8
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_ldexp_f32 v1, v7, v2
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v5, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
@@ -24615,86 +33462,128 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX9-NEXT: v_ffbh_u32_e32 v8, v5
; GFX9-NEXT: v_min_u32_e32 v8, 32, v8
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v9, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
+; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
+; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX9-NEXT: v_add3_u32 v4, v4, v8, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX9-NEXT: v_ffbh_u32_e32 v4, v7
; GFX9-NEXT: v_min_u32_e32 v10, 32, v4
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT: v_ffbh_u32_e32 v7, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT: v_ffbh_u32_e32 v8, v1
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT: v_min_u32_e32 v8, 32, v8
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX9-NEXT: v_min_u32_e32 v7, 32, v7
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v7, 32, v10
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT: v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_ldexp_f32 v4, v4, v7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT: v_add3_u32 v7, v7, v4, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT: v_ldexp_f32 v7, v0, v1
+; GFX9-NEXT: v_bfe_u32 v0, v7, 16, 1
+; GFX9-NEXT: v_add3_u32 v8, v0, v7, s4
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_ldexp_f32 v6, v9, v8
-; GFX9-NEXT: v_min_u32_e32 v8, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5
+; GFX9-NEXT: v_min_u32_e32 v9, 32, v0
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_perm_b32 v1, v4, v6, s4
+; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v8, v5
-; GFX10-NEXT: v_ffbh_u32_e32 v9, v1
-; GFX10-NEXT: v_ffbh_u32_e32 v10, v3
-; GFX10-NEXT: v_ffbh_u32_e32 v11, v7
+; GFX10-NEXT: v_ffbh_u32_e32 v9, v7
+; GFX10-NEXT: v_ffbh_u32_e32 v10, v1
+; GFX10-NEXT: v_ffbh_u32_e32 v11, v3
; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
; GFX10-NEXT: v_min_u32_e32 v9, 32, v9
; GFX10-NEXT: v_min_u32_e32 v10, 32, v10
; GFX10-NEXT: v_min_u32_e32 v11, 32, v11
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8
; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX10-NEXT: v_min_u32_e32 v6, 1, v6
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT: v_min_u32_e32 v6, 1, v6
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT: v_or_b32_e32 v5, v7, v6
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX10-NEXT: v_or_b32_e32 v3, v7, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v11
-; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v9
+; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v4
+; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v5
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11
+; GFX10-NEXT: v_ldexp_f32 v3, v3, v8
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v10
-; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX10-NEXT: v_ldexp_f32 v4, v4, v8
+; GFX10-NEXT: v_ldexp_f32 v1, v4, v1
; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
+; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX10-NEXT: v_ldexp_f32 v2, v2, v6
-; GFX10-NEXT: v_ldexp_f32 v1, v3, v1
-; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5
-; GFX11-NEXT: v_clz_i32_u32_e32 v9, v1
-; GFX11-NEXT: v_clz_i32_u32_e32 v10, v3
-; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7
+; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7
+; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1
+; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_min_u32_e32 v8, 32, v8
; GFX11-NEXT: v_min_u32_e32 v9, 32, v9
@@ -24703,36 +33592,61 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX11-NEXT: v_min_u32_e32 v11, 32, v11
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8
; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX11-NEXT: v_min_u32_e32 v6, 1, v6
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT: v_min_u32_e32 v6, 1, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_or_b32_e32 v5, v7, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT: v_or_b32_e32 v3, v7, v6
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v11
-; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v9
+; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v5
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10
+; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11
+; GFX11-NEXT: v_ldexp_f32 v3, v3, v8
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v10
-; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT: v_ldexp_f32 v4, v4, v8
+; GFX11-NEXT: v_ldexp_f32 v1, v4, v1
; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11-NEXT: v_ldexp_f32 v2, v2, v6
-; GFX11-NEXT: v_ldexp_f32 v1, v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i64> %x to <4 x bfloat>
ret <4 x bfloat> %op
@@ -29207,7 +38121,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_bf16:
@@ -29217,7 +38137,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_bf16:
@@ -29227,7 +38153,11 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fma_bf16:
@@ -29238,7 +38168,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
ret bfloat %op
@@ -29281,13 +38217,26 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v2bf16:
@@ -29296,12 +38245,24 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -29316,7 +38277,17 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_perm_b32 v0, v2, v3, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fma_v2bf16:
@@ -29330,7 +38301,21 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
-; GFX11-NEXT: v_perm_b32 v0, v2, v3, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
@@ -29384,17 +38369,36 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_fma_f32 v3, v7, v6, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v3bf16:
@@ -29404,37 +38408,106 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: v_fma_f32 v3, v7, v6, v3
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3
+; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v5, v0, v3
+; GFX10-NEXT: v_add3_u32 v0, v1, v6, 0x7fff
+; GFX10-NEXT: v_add3_u32 v1, v2, v4, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fma_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v5, v0, v3
+; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX11-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v0, v1, v6, 0x7fff
+; GFX11-NEXT: v_add3_u32 v1, v2, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_add3_u32 v2, v2, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
}
@@ -29496,23 +38569,49 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v4bf16:
@@ -29521,20 +38620,42 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -29545,45 +38666,94 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v10
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_perm_b32 v0, v4, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v5, v6, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v0, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v8, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v0, v0, v7, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v5, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fma_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_fmac_f32 v9, v11, v10 :: v_dual_lshlrev_b32 v6, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-NEXT: v_dual_fmac_f32 v4, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v4, v9, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX11-NEXT: v_perm_b32 v1, v5, v6, 0x7060302
+; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v3, v3, v5, 0x7fff
+; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v0, v7, 16, 1
+; GFX11-NEXT: v_add3_u32 v1, v8, v6, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v0, v0, v7, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
@@ -29625,10 +38795,22 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_bf16:
@@ -29637,10 +38819,21 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_bf16:
@@ -29649,10 +38842,19 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmuladd_bf16:
@@ -29661,11 +38863,24 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0x7fc0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
ret bfloat %op
@@ -29715,18 +38930,44 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v2bf16:
@@ -29734,17 +38975,39 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -29757,13 +39020,33 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmuladd_v2bf16:
@@ -29772,18 +39055,45 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0x7fc0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
@@ -29848,24 +39158,61 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v3bf16:
@@ -29874,50 +39221,175 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v7, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_add3_u32 v3, v4, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v4, v5, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmuladd_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v3, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0x7fc0, v3 :: v_dual_lshlrev_b32 v3, 16, v4
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v3, 16, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v1, 0x7fc0, v6 :: v_dual_add_f32 v0, v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT: v_add3_u32 v3, v4, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v4, v5, v0, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
}
@@ -29994,32 +39466,82 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v7
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v4bf16:
@@ -30027,29 +39549,71 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -30059,64 +39623,152 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_mul_f32_e32 v7, v9, v8
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_mul_f32_e32 v3, v8, v7
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_bfe_u32 v2, v7, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT: v_add3_u32 v3, v8, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v7, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v2, v6, v2
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_add3_u32 v5, v6, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmuladd_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v2, v6, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX11-NEXT: v_mul_f32_e32 v3, v8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT: v_mul_f32_e32 v7, v9, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v7, 16, 1
+; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v3, v8, v6, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-NEXT: v_add3_u32 v2, v2, v7, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-NEXT: v_dual_cndmask_b32 v2, 0x7fc0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v10
+; GFX11-NEXT: v_dual_cndmask_b32 v1, 0x7fc0, v6 :: v_dual_lshlrev_b32 v6, 16, v4
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0x7fc0, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v5, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v6, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index a69fb35f8f0cb0..63fa5c5ac18020 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -787,7 +787,13 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
@@ -796,8 +802,14 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
%arg0.ext = fpext half %arg0 to float
%arg1.ext = fpext half %arg1 to float
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 490167ee3635a3..73655161ce86cf 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -1504,106 +1504,127 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_bf16_agent:
; GFX900: ; %bb.0:
-; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[0:1], 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_mov_b32_e32 v0, 0x7fc0
+; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_and_b32 s2, s4, -4
-; GFX900-NEXT: s_mov_b32 s3, s5
-; GFX900-NEXT: s_load_dword s6, s[2:3], 0x0
-; GFX900-NEXT: s_and_b32 s4, s4, 3
-; GFX900-NEXT: s_lshl_b32 s4, s4, 3
-; GFX900-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX900-NEXT: s_not_b32 s5, s5
+; GFX900-NEXT: s_and_b32 s2, s6, -4
+; GFX900-NEXT: s_mov_b32 s3, s7
+; GFX900-NEXT: s_load_dword s7, s[2:3], 0x0
+; GFX900-NEXT: s_and_b32 s5, s6, 3
+; GFX900-NEXT: s_lshl_b32 s5, s5, 3
+; GFX900-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX900-NEXT: s_not_b32 s6, s6
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v1, s6
+; GFX900-NEXT: v_mov_b32_e32 v2, s7
; GFX900-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX900-NEXT: v_and_or_b32 v1, v2, s5, v1
-; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX900-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, s5, v2
+; GFX900-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX900-NEXT: global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
-; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX900-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX900-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX900-NEXT: s_cbranch_execnz .LBB10_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, s5, v2
; GFX900-NEXT: global_store_short v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_bf16_agent:
; GFX908: ; %bb.0:
-; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[0:1], 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_movk_i32 s4, 0x7fff
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x7fc0
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_and_b32 s2, s4, -4
-; GFX908-NEXT: s_mov_b32 s3, s5
-; GFX908-NEXT: s_load_dword s6, s[2:3], 0x0
-; GFX908-NEXT: s_and_b32 s4, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, s4, 3
-; GFX908-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX908-NEXT: s_not_b32 s5, s5
+; GFX908-NEXT: s_and_b32 s2, s6, -4
+; GFX908-NEXT: s_mov_b32 s3, s7
+; GFX908-NEXT: s_load_dword s7, s[2:3], 0x0
+; GFX908-NEXT: s_and_b32 s5, s6, 3
+; GFX908-NEXT: s_lshl_b32 s5, s5, 3
+; GFX908-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX908-NEXT: s_not_b32 s6, s6
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
+; GFX908-NEXT: v_mov_b32_e32 v2, s7
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s5, v1
-; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX908-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX908-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX908-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, s5, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s5, v2
; GFX908-NEXT: global_store_short v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_bf16_agent:
; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_movk_i32 s4, 0x7fff
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x7fc0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_and_b32 s2, s4, -4
-; GFX90A-NEXT: s_mov_b32 s3, s5
-; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x0
-; GFX90A-NEXT: s_and_b32 s4, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX90A-NEXT: s_not_b32 s5, s5
+; GFX90A-NEXT: s_and_b32 s2, s6, -4
+; GFX90A-NEXT: s_mov_b32 s3, s7
+; GFX90A-NEXT: s_load_dword s7, s[2:3], 0x0
+; GFX90A-NEXT: s_and_b32 s5, s6, 3
+; GFX90A-NEXT: s_lshl_b32 s5, s5, 3
+; GFX90A-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX90A-NEXT: s_not_b32 s6, s6
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
+; GFX90A-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s5, v1
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX90A-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX90A-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s5, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s5, v2
; GFX90A-NEXT: global_store_short v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
@@ -1627,7 +1648,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, s2, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1658,13 +1684,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v1, s2, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1
; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
@@ -1690,108 +1721,129 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_bf16_system:
; GFX900: ; %bb.0:
-; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[0:1], 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_mov_b32_e32 v0, 0x7fc0
+; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_and_b32 s2, s4, -4
-; GFX900-NEXT: s_mov_b32 s3, s5
-; GFX900-NEXT: s_load_dword s6, s[2:3], 0x0
-; GFX900-NEXT: s_and_b32 s4, s4, 3
-; GFX900-NEXT: s_lshl_b32 s4, s4, 3
-; GFX900-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX900-NEXT: s_not_b32 s5, s5
+; GFX900-NEXT: s_and_b32 s2, s6, -4
+; GFX900-NEXT: s_mov_b32 s3, s7
+; GFX900-NEXT: s_load_dword s7, s[2:3], 0x0
+; GFX900-NEXT: s_and_b32 s5, s6, 3
+; GFX900-NEXT: s_lshl_b32 s5, s5, 3
+; GFX900-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX900-NEXT: s_not_b32 s6, s6
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v1, s6
+; GFX900-NEXT: v_mov_b32_e32 v2, s7
; GFX900-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX900-NEXT: v_and_or_b32 v1, v2, s5, v1
-; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX900-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, s5, v2
+; GFX900-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX900-NEXT: global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
-; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX900-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX900-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX900-NEXT: s_cbranch_execnz .LBB11_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, s5, v2
; GFX900-NEXT: global_store_short v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_bf16_system:
; GFX908: ; %bb.0:
-; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[0:1], 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_movk_i32 s4, 0x7fff
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x7fc0
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_and_b32 s2, s4, -4
-; GFX908-NEXT: s_mov_b32 s3, s5
-; GFX908-NEXT: s_load_dword s6, s[2:3], 0x0
-; GFX908-NEXT: s_and_b32 s4, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, s4, 3
-; GFX908-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX908-NEXT: s_not_b32 s5, s5
+; GFX908-NEXT: s_and_b32 s2, s6, -4
+; GFX908-NEXT: s_mov_b32 s3, s7
+; GFX908-NEXT: s_load_dword s7, s[2:3], 0x0
+; GFX908-NEXT: s_and_b32 s5, s6, 3
+; GFX908-NEXT: s_lshl_b32 s5, s5, 3
+; GFX908-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX908-NEXT: s_not_b32 s6, s6
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
+; GFX908-NEXT: v_mov_b32_e32 v2, s7
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s5, v1
-; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX908-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX908-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX908-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, s5, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s5, v2
; GFX908-NEXT: global_store_short v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_bf16_system:
; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_movk_i32 s4, 0x7fff
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x7fc0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_and_b32 s2, s4, -4
-; GFX90A-NEXT: s_mov_b32 s3, s5
-; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x0
-; GFX90A-NEXT: s_and_b32 s4, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX90A-NEXT: s_not_b32 s5, s5
+; GFX90A-NEXT: s_and_b32 s2, s6, -4
+; GFX90A-NEXT: s_mov_b32 s3, s7
+; GFX90A-NEXT: s_load_dword s7, s[2:3], 0x0
+; GFX90A-NEXT: s_and_b32 s5, s6, 3
+; GFX90A-NEXT: s_lshl_b32 s5, s5, 3
+; GFX90A-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX90A-NEXT: s_not_b32 s6, s6
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
+; GFX90A-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX90A-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX90A-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s5, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v1, v[2:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s5, v2
; GFX90A-NEXT: global_store_short v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
@@ -1815,7 +1867,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, s2, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1846,13 +1903,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v1, s2, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v3, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1
; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 78db126fb2dc49..f494e5d7ab6bb9 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -3,10 +3,10 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
; We only care about which physical registers the parameters are copied from;
; the function bodies are just some arbitrary uses.
@@ -64,59 +64,113 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre
; GISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY12]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into `ptr poison`)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
- ; DAGISEL-GFX11-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
- ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
- ; DAGISEL-GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
- ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%c = add <4 x i32> %a, %b
store <4 x i32> %c, ptr poison
ret void
@@ -183,81 +237,157 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_ptr(ptr inreg
; GISEL-GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY15]], [[COPY11]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (p5) into %ir.b5, addrspace 5)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
- ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
- ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
- ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
- ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
- ; DAGISEL-GFX11-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
- ; DAGISEL-GFX11-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
- ; DAGISEL-GFX11-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; DAGISEL-GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
- ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
- ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
- ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
- ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
- ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
- ; DAGISEL-GFX10-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
- ; DAGISEL-GFX10-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; DAGISEL-GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; DAGISEL-GFX11-WF32-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; DAGISEL-GFX11-WF32-NEXT: SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; DAGISEL-GFX11-WF64-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; DAGISEL-GFX11-WF64-NEXT: SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; DAGISEL-GFX10-WF32-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; DAGISEL-GFX10-WF32-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; DAGISEL-GFX10-WF64-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3)
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; DAGISEL-GFX10-WF64-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
store ptr %a, ptr %b
store ptr addrspace(1) %a1, ptr addrspace(1) %b1
store ptr addrspace(3) %a3, ptr addrspace(3) %b3
@@ -346,119 +476,233 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr,
; GISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY22]], [[REG_SEQUENCE3]], 16, 0, implicit $exec :: (store (<4 x s32>) into `ptr addrspace(1) poison` + 16, addrspace 1)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
- ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
- ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
- ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
- ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
- ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
- ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
- ; DAGISEL-GFX11-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
- ; DAGISEL-GFX11-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
- ; DAGISEL-GFX11-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
- ; DAGISEL-GFX11-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
- ; DAGISEL-GFX11-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
- ; DAGISEL-GFX11-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
- ; DAGISEL-GFX11-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
- ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
- ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
- ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
- ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
- ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
- ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
- ; DAGISEL-GFX10-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
- ; DAGISEL-GFX10-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
- ; DAGISEL-GFX10-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
- ; DAGISEL-GFX10-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
- ; DAGISEL-GFX10-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
- ; DAGISEL-GFX10-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
- ; DAGISEL-GFX10-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%p = extractvalue {ptr, i32, <4 x i32>} %a, 0
%i = extractvalue {ptr, i32, <4 x i32>} %a, 1
%v = extractvalue {ptr, i32, <4 x i32>} %a, 2
@@ -497,29 +741,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_float(float in
; GISEL-GFX10-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_float
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_float
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_float
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%c = fadd float %a, %b
store float %c, ptr poison
ret void
@@ -552,29 +820,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_half
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_half
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%c = fadd half %a, %b
store half %c, ptr poison
ret void
@@ -607,33 +899,93 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
- ; DAGISEL-GFX11-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
- ; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
- ; DAGISEL-GFX10-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
- ; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_ADD3_U32_e64_]], implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_O_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32704
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[V_LSHRREV_B32_e64_]], killed [[V_CMP_O_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%c = fadd bfloat %a, %b
store bfloat %c, ptr poison
ret void
@@ -666,29 +1018,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%c = add i16 %a, %b
store i16 %c, ptr poison
ret void
@@ -787,101 +1163,197 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i
; GISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY27]], [[COPY25]], 16, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into `ptr poison` + 16, basealign 32)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
- ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
- ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
- ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
- ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
- ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
- ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
- ; DAGISEL-GFX11-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
- ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
- ; DAGISEL-GFX11-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
- ; DAGISEL-GFX11-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
- ; DAGISEL-GFX11-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
- ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
- ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
- ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
- ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
- ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
- ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
- ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
- ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
- ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
- ; DAGISEL-GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
- ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
- ; DAGISEL-GFX10-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
- ; DAGISEL-GFX10-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
- ; DAGISEL-GFX10-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
- ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
+ ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%c = add <16 x i16> %a, %b
store <16 x i16> %c, ptr poison
ret void
@@ -922,45 +1394,85 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i
; GISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (<2 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_many_regs
- ; DAGISEL-GFX11: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135
- ; DAGISEL-GFX11-NEXT: {{ $}}
- ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
- ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
- ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-NEXT: S_ENDPGM 0
- ;
- ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_many_regs
- ; DAGISEL-GFX10: bb.0 (%ir-block.0):
- ; DAGISEL-GFX10-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135
- ; DAGISEL-GFX10-NEXT: {{ $}}
- ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
- ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
- ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+ ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135
+ ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+ ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135
+ ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+ ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135
+ ; DAGISEL-GFX10-WF32-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs
+ ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135
+ ; DAGISEL-GFX10-WF64-NEXT: {{ $}}
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
%c = extractelement <36 x i32> %a, i32 35
store i32 %c, ptr addrspace(1) poison
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 3be4665cf3a005..27398ecb77bd56 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1398,61 +1398,75 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, -4, v0
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: ds_read_b32 v3, v1
+; VI-NEXT: ds_read_b32 v4, v1
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_mov_b32 s4, 0xffff
; VI-NEXT: v_and_b32_e32 v0, 24, v2
; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v3, 0x7fc0
; VI-NEXT: .LBB10_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
-; VI-NEXT: v_and_b32_e32 v5, v4, v2
-; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v3, v5, v3
-; VI-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_lshrrev_b32_sdwa v4, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f32_e32 v4, 4.0, v4
+; VI-NEXT: v_bfe_u32 v7, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc
+; VI-NEXT: v_and_b32_e32 v6, v5, v2
+; VI-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; VI-NEXT: v_or_b32_e32 v4, v6, v4
+; VI-NEXT: ds_cmpst_rtn_b32 v4, v1, v5, v4
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; VI-NEXT: s_cbranch_execnz .LBB10_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: lds_atomic_fadd_ret_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX9-NEXT: ds_read_b32 v2, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX9-NEXT: ds_read_b32 v4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v0, 24, v3
-; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s4
-; GFX9-NEXT: v_not_b32_e32 v3, v3
+; GFX9-NEXT: v_and_b32_e32 v0, 24, v2
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; GFX9-NEXT: v_not_b32_e32 v2, v2
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc0
; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: v_lshrrev_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_add_f32_e32 v2, 4.0, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v6, v6, v4, s6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX9-NEXT: v_and_or_b32 v4, v5, v2, v4
+; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v1, v5, v4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: lds_atomic_fadd_ret_bf16:
@@ -1539,19 +1553,26 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v4, 0x7fc0
; VI-NEXT: .LBB11_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_add_f32_e32 v4, 4.0, v4
-; VI-NEXT: v_and_b32_e32 v5, v3, v2
-; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; VI-NEXT: v_lshrrev_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f32_e32 v5, 4.0, v5
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; VI-NEXT: v_and_b32_e32 v6, v3, v2
+; VI-NEXT: v_lshlrev_b32_e32 v5, v0, v5
+; VI-NEXT: v_or_b32_e32 v5, v6, v5
+; VI-NEXT: ds_cmpst_rtn_b32 v5, v1, v3, v5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v3, v4
+; VI-NEXT: v_mov_b32_e32 v3, v5
; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; VI-NEXT: s_cbranch_execnz .LBB11_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1569,18 +1590,25 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4
; GFX9-NEXT: v_not_b32_e32 v2, v2
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc0
; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX9-NEXT: v_lshrrev_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_add_f32_e32 v5, 4.0, v5
+; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v6, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, v0, v5
+; GFX9-NEXT: v_and_or_b32 v5, v3, v2, v5
+; GFX9-NEXT: ds_cmpst_rtn_b32 v5, v1, v3, v5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v3, v4
+; GFX9-NEXT: v_mov_b32_e32 v3, v5
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index d76bb48b4a82a3..377773be7d412e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -4237,57 +4237,99 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
; GFX9-LABEL: fma_shuffle_v2bf16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_movk_i32 s2, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc0
+; GFX9-NEXT: s_mov_b32 s3, 0x5040100
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3]
-; GFX9-NEXT: s_mov_b32 s0, 0x7060302
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[0:1]
+; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7
-; GFX9-NEXT: v_fma_f32 v0, v8, v4, v0
-; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
-; GFX9-NEXT: v_fma_f32 v1, v12, v4, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_fma_f32 v8, v9, v10, v8
+; GFX9-NEXT: v_fma_f32 v1, v9, v5, v1
+; GFX9-NEXT: v_fma_f32 v2, v13, v5, v2
+; GFX9-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX9-NEXT: v_fma_f32 v9, v13, v10, v12
+; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v5, v5, v8, s2
+; GFX9-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX9-NEXT: v_add3_u32 v10, v10, v1, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v12, v12, v9, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_add3_u32 v13, v13, v2, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
-; GFX9-NEXT: v_fma_f32 v0, v2, v10, v0
-; GFX9-NEXT: v_fma_f32 v2, v2, v5, v4
-; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
-; GFX9-NEXT: v_fma_f32 v3, v3, v5, v7
-; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0
-; GFX9-NEXT: v_perm_b32 v0, v2, v0, s0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v12, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v13, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_fma_f32 v1, v3, v11, v1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT: v_fma_f32 v2, v4, v11, v2
+; GFX9-NEXT: v_fma_f32 v4, v4, v6, v8
+; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2
+; GFX9-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v6, v6, v3, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v8, v8, v2, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add3_u32 v9, v9, v4, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc
+; GFX9-NEXT: v_perm_b32 v2, v4, v2, s3
+; GFX9-NEXT: v_perm_b32 v1, v3, v1, s3
+; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fma_shuffle_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4296,73 +4338,164 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9
; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_fmac_f32_e32 v11, v10, v4
-; GFX10-NEXT: v_fmac_f32_e32 v1, v10, v9
+; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9
+; GFX10-NEXT: v_fmac_f32_e32 v1, v11, v4
+; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_add3_u32 v9, v9, v10, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v11
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v11, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v12
-; GFX10-NEXT: v_fmac_f32_e32 v4, v3, v12
-; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v5
-; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v5
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x7060302
-; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc0, v9, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v8, v8, v7, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
+; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fma_shuffle_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5]
-; GFX11-NEXT: global_load_b64 v[2:3], v6, s[0:1]
-; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5]
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v3
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v1, v10, v9
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_fmac_f32 v1, v3, v5 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-NEXT: v_dual_fmac_f32 v11, v10, v4 :: v_dual_lshlrev_b32 v8, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_fmac_f32 v10, v11, v9 :: v_dual_and_b32 v7, 0xffff0000, v0
+; GFX11-NEXT: v_fmac_f32_e32 v1, v11, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9
+; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v9, v9, v10, 0x7fff
+; GFX11-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v11
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_dual_fmac_f32 v4, v3, v12 :: v_dual_fmac_f32 v7, v2, v5
+; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v11, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v7, 0x7fc0, v9 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fmac_f32_e32 v1, v3, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_fmac_f32 v7, v3, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v12
-; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT: v_fmac_f32_e32 v4, v2, v5
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v8, v8, v7, 0x7fff
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x7060302
-; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc0, v3, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index a9faa130d6379f..884860712632d7 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK,SM80 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
More information about the llvm-commits
mailing list