[llvm] ceb744e - [AMDGPU] Fix canonicalization of truncated values. (#83054)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 13 05:08:43 PDT 2024
Author: Harald van Dijk
Date: 2024-03-13T12:08:39Z
New Revision: ceb744eb2fa0895db1526110462745962fdf43c0
URL: https://github.com/llvm/llvm-project/commit/ceb744eb2fa0895db1526110462745962fdf43c0
DIFF: https://github.com/llvm/llvm-project/commit/ceb744eb2fa0895db1526110462745962fdf43c0.diff
LOG: [AMDGPU] Fix canonicalization of truncated values. (#83054)
We were relying on roundings to implicitly canonicalize, which is
generally safe, except with roundings that may be optimized away.
Fixes #82937.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/bf16.ll
llvm/test/CodeGen/AMDGPU/clamp.ll
llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 5ed82c0c4b1b8e..86f77f7b64e88d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -194,7 +194,25 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
}];
}
-class is_canonicalized<SDPatternOperator op> : PatFrag<
+class is_canonicalized_1<SDPatternOperator op> : PatFrag<
+ (ops node:$src0),
+ (op $src0),
+ [{
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
+
+ return Lowering.isCanonicalized(*CurDAG, N->getOperand(0));
+ }]> {
+
+ let GISelPredicateCode = [{
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+ MF.getSubtarget().getTargetLowering());
+
+ return TLI->isCanonicalized(MI.getOperand(1).getReg(), MF);
+ }];
+}
+
+class is_canonicalized_2<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
[{
@@ -210,8 +228,8 @@ class is_canonicalized<SDPatternOperator op> : PatFrag<
const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
MF.getSubtarget().getTargetLowering());
- return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) &&
- TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF));
+ return TLI->isCanonicalized(MI.getOperand(1).getReg(), MF) &&
+ TLI->isCanonicalized(MI.getOperand(2).getReg(), MF);
}];
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9bc1b8eb598f3a..5ccf21f76015de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12572,6 +12572,10 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FREM:
case ISD::FP_ROUND:
case ISD::FP_EXTEND:
+ case ISD::FP16_TO_FP:
+ case ISD::FP_TO_FP16:
+ case ISD::BF16_TO_FP:
+ case ISD::FP_TO_BF16:
case ISD::FLDEXP:
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMAD_FTZ:
@@ -12591,6 +12595,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case AMDGPUISD::CVT_F32_UBYTE1:
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3:
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::COS_HW:
return true;
// It can/will be lowered or combined as a bit operation.
@@ -12600,6 +12607,20 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FCOPYSIGN:
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ case ISD::AND:
+ if (Op.getValueType() == MVT::i32) {
+ // Be careful as we only know it is a bitcast floating point type. It
+ // could be f32, v2f16, we have no way of knowing. Luckily the constant
+ // value that we optimize for, which comes up in fp32 to bf16 conversions,
+ // is valid to optimize for all types.
+ if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (RHS->getZExtValue() == 0xffff0000) {
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ }
+ }
+ }
+ break;
+
case ISD::FSIN:
case ISD::FCOS:
case ISD::FSINCOS:
@@ -12665,6 +12686,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
return false;
case ISD::BITCAST:
+ // TODO: This is incorrect as it loses track of the operand's type. We may
+ // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
+ // same bits that are canonicalized in one type need not be in the other.
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
case ISD::TRUNCATE: {
// Hack round the mess we make when legalizing extract_vector_elt
@@ -12694,25 +12718,26 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_sqrt:
return true;
default:
break;
}
- [[fallthrough]];
+ break;
}
default:
- // FIXME: denormalsEnabledForType is broken for dynamic
- return denormalsEnabledForType(DAG, Op.getValueType()) &&
- DAG.isKnownNeverSNaN(Op);
+ break;
}
- llvm_unreachable("invalid operation");
+ // FIXME: denormalsEnabledForType is broken for dynamic
+ return denormalsEnabledForType(DAG, Op.getValueType()) &&
+ DAG.isKnownNeverSNaN(Op);
}
-bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
unsigned MaxDepth) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineInstr *MI = MRI.getVRegDef(Reg);
unsigned Opcode = MI->getOpcode();
@@ -12931,27 +12956,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
}
}
- unsigned SrcOpc = N0.getOpcode();
-
- // If it's free to do so, push canonicalizes further up the source, which may
- // find a canonical source.
- //
- // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
- // sNaNs.
- if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
- auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
- if (CRHS && N0.hasOneUse()) {
- SDLoc SL(N);
- SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
- N0.getOperand(0));
- SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
- DCI.AddToWorklist(Canon0.getNode());
-
- return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
- }
- }
-
- return isCanonicalized(DAG, N0) ? N0 : SDValue();
+ return SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -15939,8 +15944,8 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
}
}
-bool SITargetLowering::denormalsEnabledForType(LLT Ty,
- MachineFunction &MF) const {
+bool SITargetLowering::denormalsEnabledForType(
+ LLT Ty, const MachineFunction &MF) const {
switch (Ty.getScalarSizeInBits()) {
case 32:
return !denormalModeIsFlushAllF32(MF);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a20442e3737ee1..89da4428e3ab0a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -523,10 +523,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
- bool isCanonicalized(Register Reg, MachineFunction &MF,
+ bool isCanonicalized(Register Reg, const MachineFunction &MF,
unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
- bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
+ bool denormalsEnabledForType(LLT Ty, const MachineFunction &MF) const;
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
const TargetRegisterInfo *TRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 33c93cdf20c43b..3ab788406ecb28 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2944,6 +2944,34 @@ def : GCNPat<
(V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
(V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
+// If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
+let AddedComplexity = 1000 in {
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> f16:$src),
+ (COPY f16:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> v2f16:$src),
+ (COPY v2f16:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> f32:$src),
+ (COPY f32:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> v2f32:$src),
+ (COPY v2f32:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> f64:$src),
+ (COPY f64:$src)
+>;
+}
+
// Prefer selecting to max when legal, but using mul is always valid.
let AddedComplexity = -5 in {
@@ -3277,8 +3305,8 @@ def : GCNPat <
let AddedComplexity = 5 in {
def : GCNPat <
- (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
- (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
+ (v2f16 (is_canonicalized_2<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
+ (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
(V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
>;
}
@@ -3590,6 +3618,17 @@ FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
+class
+FPMinCanonMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : GCNPat <
+ (min_or_max (is_canonicalized_1<fcanonicalize>
+ (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
+ (VOP3Mods vt:$src1, i32:$src1_mods))),
+ (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
let OtherPredicates = [isGFX11Plus] in {
def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
@@ -3599,6 +3638,10 @@ def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
}
let OtherPredicates = [isGFX9Plus] in {
@@ -3612,6 +3655,10 @@ def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fmi
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}
// Convert a floating-point power of 2 to the integer exponent.
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ebb77c13c4af7b..98658834e89784 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -16968,7 +16968,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -16977,7 +16977,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -17163,9 +17163,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -17174,9 +17174,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -17280,8 +17280,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -17293,8 +17291,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -17375,10 +17371,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v1, v1, v3
; GCN-NEXT: v_min_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -17396,10 +17388,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -17522,12 +17510,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v2, v2, v5
; GCN-NEXT: v_min_f32_e32 v1, v1, v4
; GCN-NEXT: v_min_f32_e32 v0, v0, v3
@@ -17551,12 +17533,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
@@ -17688,14 +17664,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v3, v3, v7
; GCN-NEXT: v_min_f32_e32 v2, v2, v6
; GCN-NEXT: v_min_f32_e32 v1, v1, v5
@@ -17725,14 +17693,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
@@ -17951,22 +17911,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v7, v7, v15
; GCN-NEXT: v_min_f32_e32 v6, v6, v14
; GCN-NEXT: v_min_f32_e32 v5, v5, v13
@@ -18020,22 +17964,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
@@ -18382,71 +18310,51 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_min_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_min_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_min_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_min_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_min_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_min_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_min_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_min_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_min_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_min_f32_e32 v5, v5, v21
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
@@ -18461,8 +18369,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_min_f32_e32 v4, v4, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -18474,21 +18380,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v3, v3, v19
; GCN-NEXT: v_min_f32_e32 v2, v2, v18
; GCN-NEXT: v_min_f32_e32 v1, v1, v17
; GCN-NEXT: v_min_f32_e32 v0, v0, v16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -18503,8 +18398,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -18513,14 +18409,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
@@ -18531,13 +18425,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -18560,13 +18454,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
@@ -18579,48 +18473,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_min_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
@@ -18634,6 +18494,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
@@ -19267,287 +19131,223 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_min_f32_e32 v31, v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_min_f32_e32 v30, v30, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_min_f32_e32 v29, v29, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_min_f32_e32 v28, v28, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_min_f32_e32 v27, v27, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_min_f32_e32 v26, v26, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_min_f32_e32 v25, v25, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_min_f32_e32 v24, v24, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_min_f32_e32 v23, v23, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_min_f32_e32 v22, v22, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_min_f32_e32 v21, v21, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_min_f32_e32 v20, v20, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_min_f32_e32 v19, v19, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_min_f32_e32 v18, v18, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_min_f32_e32 v17, v17, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_min_f32_e32 v16, v16, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_min_f32_e32 v15, v15, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_min_f32_e32 v14, v14, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_min_f32_e32 v13, v13, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_min_f32_e32 v12, v12, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_min_f32_e32 v11, v11, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_min_f32_e32 v10, v10, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_min_f32_e32 v9, v9, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_min_f32_e32 v8, v8, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_min_f32_e32 v7, v7, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_min_f32_e32 v6, v6, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_min_f32_e32 v5, v5, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_min_f32_e32 v4, v4, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_min_f32_e32 v3, v3, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_min_f32_e32 v2, v2, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_min_f32_e32 v1, v1, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -19590,322 +19390,258 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -21097,8 +20833,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -21110,8 +20844,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -21192,10 +20924,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v1, v1, v3
; GCN-NEXT: v_max_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -21213,10 +20941,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -21339,12 +21063,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v2, v2, v5
; GCN-NEXT: v_max_f32_e32 v1, v1, v4
; GCN-NEXT: v_max_f32_e32 v0, v0, v3
@@ -21368,12 +21086,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
@@ -21505,14 +21217,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v7
; GCN-NEXT: v_max_f32_e32 v2, v2, v6
; GCN-NEXT: v_max_f32_e32 v1, v1, v5
@@ -21542,14 +21246,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
@@ -21768,22 +21464,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v7, v7, v15
; GCN-NEXT: v_max_f32_e32 v6, v6, v14
; GCN-NEXT: v_max_f32_e32 v5, v5, v13
@@ -21837,22 +21517,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
@@ -22199,71 +21863,51 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_max_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_max_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_max_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_max_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_max_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_max_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_max_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_max_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_max_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_max_f32_e32 v5, v5, v21
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
@@ -22278,8 +21922,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_max_f32_e32 v4, v4, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -22291,21 +21933,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v19
; GCN-NEXT: v_max_f32_e32 v2, v2, v18
; GCN-NEXT: v_max_f32_e32 v1, v1, v17
; GCN-NEXT: v_max_f32_e32 v0, v0, v16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -22320,8 +21951,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -22330,14 +21962,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
@@ -22348,13 +21978,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -22377,13 +22007,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
@@ -22392,52 +22022,18 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_max_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
@@ -22451,6 +22047,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
@@ -23084,287 +22684,223 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_max_f32_e32 v31, v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_max_f32_e32 v30, v30, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_max_f32_e32 v29, v29, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_max_f32_e32 v28, v28, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_max_f32_e32 v27, v27, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_max_f32_e32 v26, v26, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_max_f32_e32 v25, v25, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_max_f32_e32 v24, v24, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_max_f32_e32 v23, v23, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_max_f32_e32 v22, v22, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_max_f32_e32 v21, v21, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_max_f32_e32 v20, v20, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_max_f32_e32 v19, v19, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_max_f32_e32 v18, v18, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_max_f32_e32 v17, v17, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_max_f32_e32 v16, v16, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_max_f32_e32 v15, v15, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_max_f32_e32 v14, v14, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_max_f32_e32 v13, v13, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_max_f32_e32 v12, v12, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_max_f32_e32 v11, v11, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_max_f32_e32 v10, v10, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_max_f32_e32 v9, v9, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_max_f32_e32 v8, v8, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_max_f32_e32 v7, v7, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_max_f32_e32 v6, v6, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_max_f32_e32 v5, v5, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_max_f32_e32 v4, v4, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_max_f32_e32 v3, v3, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_max_f32_e32 v2, v2, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_max_f32_e32 v1, v1, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -23407,322 +22943,258 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -25176,7 +24648,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26818,11 +26289,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GCN-LABEL: v_canonicalize_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_canonicalize_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_canonicalize_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index dfadd8d205b04e..947284506a2970 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2996,18 +2996,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -3095,16 +3093,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
+; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -3198,9 +3195,8 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3760,19 +3756,17 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0
+; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
+; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3863,18 +3857,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 4ed1b8a520b8b5..e1981972f58d1b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -471,25 +471,15 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
ret void
}
-; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GFX9: {{flat|global}}_store_dword
-define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
- %id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
- %load = load float, ptr addrspace(1) %gep, align 4
- %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
- %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
- store float %canonicalized, ptr addrspace(1) %gep, align 4
- ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
+; %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+; %load = load float, ptr addrspace(1) %gep, align 4
+; %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+; %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+; store float %canonicalized, ptr addrspace(1) %gep, align 4
+; ret void
+; }
; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
@@ -523,32 +513,15 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1
ret void
}
-; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-
-; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
-
-; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
-
-; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
-; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
-
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
-
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
-define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
- %id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
- %load = load float, ptr addrspace(1) %gep, align 4
- %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
- %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
- store float %canonicalized, ptr addrspace(1) %gep, align 4
- ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
+; %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+; %load = load float, ptr addrspace(1) %gep, align 4
+; %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+; %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+; store float %canonicalized, ptr addrspace(1) %gep, align 4
+; ret void
+; }
; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
@@ -674,10 +647,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp
}
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
-; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
+; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]],
+; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]]
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
@@ -807,18 +779,13 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
ret half %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
-; GFX9: v_mul_f16_e32
-; GFX9: v_pk_mul_f16
-; GFX9-NOT: v_max
-; GFX9-NOT: v_pk_max
-define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
- %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
- %ins.op = fmul half %val, 8.0
- %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
- %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
- ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
+; %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
+; %ins.op = fmul half %val, 8.0
+; %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
+; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
+; ret <2 x half> %canonicalized
+; }
; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
; GFX9: v_mul_f16
@@ -842,15 +809,11 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x
ret <2 x half> %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
-; GCN: s_waitcnt
-; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
- %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
- %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
- ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
+; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
+; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
+; ret <2 x half> %canonicalized
+; }
; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 274621307f540d..581b7b4cff9ed0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -94,7 +94,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -147,7 +146,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -170,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
ret void
}
+define half @s_test_canonicalize_arg(half %x) #1 {
+; VI-LABEL: s_test_canonicalize_arg:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_test_canonicalize_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; CI-LABEL: s_test_canonicalize_arg:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_test_canonicalize_arg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %canonicalized = call half @llvm.canonicalize.f16(half %x)
+ ret half %canonicalized
+}
+
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
; VI: ; %bb.0:
@@ -242,7 +269,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -299,7 +325,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -357,7 +382,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -414,7 +438,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -471,7 +494,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -1246,9 +1268,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1323,9 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1404,9 +1422,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1485,9 +1501,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1551,9 +1565,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -2424,7 +2436,6 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16:
@@ -2456,8 +2467,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2738,7 +2748,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
@@ -2782,8 +2791,6 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
@@ -2826,13 +2833,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v2
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2878,18 +2882,18 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v6f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v6f16:
@@ -2933,22 +2937,22 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v8f16:
@@ -3001,30 +3005,30 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v12f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v12f16:
@@ -3087,38 +3091,38 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v16f16:
@@ -3216,68 +3220,68 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
-; CI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; CI-NEXT: v_cvt_f32_f16_e32 v30, v30
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
@@ -3456,228 +3460,354 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_cvt_f16_f32_e32 v2, v4
; CI-NEXT: v_cvt_f16_f32_e32 v4, v5
; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
; CI-NEXT: v_cvt_f16_f32_e32 v7, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
; CI-NEXT: v_cvt_f16_f32_e32 v6, v10
; CI-NEXT: v_cvt_f16_f32_e32 v9, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v16
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v3, v4, v3
; CI-NEXT: v_cvt_f16_f32_e32 v4, v8
; CI-NEXT: v_cvt_f16_f32_e32 v8, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v26
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v4, v5, v4
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; CI-NEXT: v_cvt_f16_f32_e32 v6, v12
; CI-NEXT: v_or_b32_e32 v5, v7, v5
; CI-NEXT: v_cvt_f16_f32_e32 v7, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v22
; CI-NEXT: v_or_b32_e32 v6, v7, v6
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v19
; CI-NEXT: v_or_b32_e32 v7, v9, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v18
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
-; CI-NEXT: v_or_b32_e32 v8, v9, v8
+; CI-NEXT: v_or_b32_e32 v8, v10, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v20
; CI-NEXT: v_or_b32_e32 v9, v11, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v19
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
-; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; CI-NEXT: v_or_b32_e32 v10, v11, v10
-; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v24
+; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
+; CI-NEXT: v_or_b32_e32 v10, v12, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v30
+; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT: v_or_b32_e32 v11, v13, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v23
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v30
-; CI-NEXT: v_or_b32_e32 v12, v13, v12
-; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; CI-NEXT: v_or_b32_e32 v13, v15, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
+; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; CI-NEXT: v_or_b32_e32 v12, v15, v12
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v31
+; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128
+; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v15
; CI-NEXT: v_cvt_f16_f32_e32 v15, v27
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44
-; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v33
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_or_b32_e32 v13, v16, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v32
+; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; CI-NEXT: v_or_b32_e32 v14, v15, v14
-; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22
; CI-NEXT: v_or_b32_e32 v15, v25, v15
-; CI-NEXT: s_waitcnt vmcnt(11)
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: s_waitcnt vmcnt(10)
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v21
+; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64
+; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v16
+; CI-NEXT: v_or_b32_e32 v16, v24, v25
+; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27
+; CI-NEXT: v_or_b32_e32 v25, v28, v24
; CI-NEXT: s_waitcnt vmcnt(9)
; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
; CI-NEXT: s_waitcnt vmcnt(8)
; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; CI-NEXT: v_or_b32_e32 v16, v17, v16
-; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
-; CI-NEXT: v_or_b32_e32 v17, v19, v17
; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; CI-NEXT: v_or_b32_e32 v20, v19, v20
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
; CI-NEXT: s_waitcnt vmcnt(6)
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v21
-; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v34
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT: v_or_b32_e32 v17, v17, v26
+; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
+; CI-NEXT: v_or_b32_e32 v18, v27, v18
+; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
+; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0
+; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0
+; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: v_cvt_f16_f32_e32 v20, v22
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v23
-; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; CI-NEXT: v_or_b32_e32 v18, v19, v18
-; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; CI-NEXT: v_or_b32_e32 v19, v21, v19
-; CI-NEXT: s_waitcnt vmcnt(3)
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v26
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v27
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84
+; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
+; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: s_waitcnt vmcnt(12)
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; CI-NEXT: v_or_b32_e32 v20, v21, v20
-; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
-; CI-NEXT: v_or_b32_e32 v21, v27, v21
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128
-; CI-NEXT: s_waitcnt vmcnt(5)
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0
+; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
+; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: s_waitcnt vmcnt(13)
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: s_waitcnt vmcnt(12)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v24
+; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; CI-NEXT: v_or_b32_e32 v20, v23, v20
+; CI-NEXT: s_waitcnt vmcnt(9)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v28
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: s_waitcnt vmcnt(4)
; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; CI-NEXT: v_or_b32_e32 v24, v25, v24
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT: v_or_b32_e32 v22, v22, v23
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88
-; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: v_or_b32_e32 v23, v27, v23
+; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
+; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_or_b32_e32 v17, v17, v18
+; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0
+; CI-NEXT: v_or_b32_e32 v25, v25, v26
+; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0
+; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_or_b32_e32 v19, v24, v19
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_or_b32_e32 v21, v22, v21
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: s_waitcnt vmcnt(4)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v22
+; CI-NEXT: v_or_b32_e32 v22, v23, v27
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52
+; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; CI-NEXT: v_or_b32_e32 v23, v28, v23
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48
+; CI-NEXT: s_waitcnt vmcnt(2)
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0
-; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT: v_or_b32_e32 v23, v23, v27
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_or_b32_e32 v24, v24, v27
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0
-; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64
-; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84
-; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80
-; CI-NEXT: s_waitcnt vmcnt(3)
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; CI-NEXT: v_or_b32_e32 v27, v28, v27
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
-; CI-NEXT: v_or_b32_e32 v23, v26, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0
-; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0
-; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0
-; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0
-; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0
-; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0
-; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0
-; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0
-; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0
-; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; CI-NEXT: v_or_b32_e32 v28, v29, v28
+; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0
+; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
+; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0
+; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0
+; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0
+; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0
+; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0
+; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0
; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index c1093a1e89c886..d53c0411ad88c1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2389,7 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2471,15 +2470,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v4
@@ -2724,7 +2721,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2807,15 +2803,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v4
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 78fb89c71e2e6a..b32630a97b3ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -951,8 +951,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1056,7 +1054,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1110,7 +1107,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1193,7 +1189,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1222,7 +1217,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1253,7 +1247,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1311,7 +1304,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1346,7 +1338,6 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1413,8 +1404,6 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1494,8 +1483,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1599,7 +1586,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1653,7 +1639,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1736,7 +1721,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1792,7 +1776,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1859,8 +1842,6 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3980,7 +3961,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
; SI-LABEL: v_fneg_canonicalize_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_canonicalize_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 17f67615c29f2e..b5440b9c38c9f2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -1021,7 +1021,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1043,7 +1042,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index ab7ab4de186142..d056a97dc54442 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @maxnum_f16(
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -170,7 +168,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -279,7 +276,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -384,21 +380,17 @@ define amdgpu_kernel void @maxnum_v2f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s2, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT: s_lshr_b32 s0, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_lshr_b32 s3, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, v2, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -497,20 +489,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -589,20 +579,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -688,27 +676,21 @@ define amdgpu_kernel void @maxnum_v3f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
-; SI-NEXT: s_lshr_b32 s3, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: s_lshr_b32 s8, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s8
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, v1, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_max_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT: v_max_f32_e32 v1, v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v2, v3, v4
+; SI-NEXT: v_max_f32_e32 v0, v0, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -837,25 +819,17 @@ define amdgpu_kernel void @maxnum_v4f16(
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: s_lshr_b32 s6, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: s_lshr_b32 s6, s5, 16
+; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_lshr_b32 s4, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
; SI-NEXT: v_max_f32_e32 v3, v3, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, v1, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, v2, v5
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v2, v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_max_f32_e32 v1, v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_max_f32_e32 v0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -986,20 +960,16 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_max_f32_e32 v3, 2.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index b7370ce0fde1ab..f934a2de9247f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @minnum_f16_ieee(
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -197,7 +195,6 @@ define amdgpu_kernel void @minnum_f16_imm_a(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -305,7 +302,6 @@ define amdgpu_kernel void @minnum_f16_imm_b(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -409,21 +405,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s2, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT: s_lshr_b32 s0, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_lshr_b32 s3, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, v2, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -556,20 +548,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -647,20 +637,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -745,27 +733,21 @@ define amdgpu_kernel void @minnum_v3f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
-; SI-NEXT: s_lshr_b32 s3, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: s_lshr_b32 s8, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s8
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_min_f32_e32 v2, v3, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, v1, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_min_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT: v_min_f32_e32 v1, v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v2, v3, v4
+; SI-NEXT: v_min_f32_e32 v0, v0, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -893,25 +875,17 @@ define amdgpu_kernel void @minnum_v4f16(
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: s_lshr_b32 s6, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: s_lshr_b32 s6, s5, 16
+; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_lshr_b32 s4, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
; SI-NEXT: v_min_f32_e32 v3, v3, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, v1, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_min_f32_e32 v2, v2, v5
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v2, v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_min_f32_e32 v1, v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -1041,20 +1015,16 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v3, 2.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index fb3e79b2cf2934..5b7f0e72b70da5 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -951,56 +951,70 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v1, 0
; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v6, 0
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v2, v0, 0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, 0
+; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, 0
+; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7
; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp
; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5
+; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1
+; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3
+; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1
; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1139,63 +1153,80 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
}
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mov_b32_e32 v0, v6
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_mov_b32_e32 v0, v6
-; GFX906-NEXT: v_mov_b32_e32 v1, v2
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9
; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8
-; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp
+; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v10
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v11
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5
+; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1
+; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2
+; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3
+; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00
+; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3
+; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v2
; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0
; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
@@ -1241,6 +1272,40 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v2
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
More information about the llvm-commits
mailing list